001package ball.io; 002/*- 003 * ########################################################################## 004 * Utilities 005 * %% 006 * Copyright (C) 2008 - 2022 Allen D. Ball 007 * %% 008 * Licensed under the Apache License, Version 2.0 (the "License"); 009 * you may not use this file except in compliance with the License. 010 * You may obtain a copy of the License at 011 * 012 * http://www.apache.org/licenses/LICENSE-2.0 013 * 014 * Unless required by applicable law or agreed to in writing, software 015 * distributed under the License is distributed on an "AS IS" BASIS, 016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 017 * See the License for the specific language governing permissions and 018 * limitations under the License. 019 * ########################################################################## 020 */ 021import java.io.File; 022import java.io.FileInputStream; 023import java.io.FileNotFoundException; 024import java.io.InputStream; 025import java.io.InputStreamReader; 026import java.io.LineNumberReader; 027import java.io.PushbackInputStream; 028import java.nio.charset.Charset; 029import java.util.Arrays; 030import java.util.Map; 031import java.util.Objects; 032import lombok.ToString; 033 034import static java.nio.charset.StandardCharsets.UTF_8; 035 036/** 037 * {@link java.io.BufferedReader} implementation which analyzes the 038 * underlying {@link InputStream} for byte order marks and selects the 039 * appropriate {@link Charset}. 040 * 041 * @see BOMCharsetMap 042 * 043 * @author {@link.uri mailto:ball@hcf.dev Allen D. Ball} 044 */ 045public class UnicodeReader extends LineNumberReader { 046 private static final Charset DEFAULT = UTF_8; 047 048 /** 049 * @param file The {@link File} to open. 050 * 051 * @throws FileNotFoundException 052 * If the {@link File} is not found. 053 */ 054 public UnicodeReader(File file) throws FileNotFoundException { 055 this(new FileInputStream(file)); 056 } 057 058 /** 059 * @param in The underlying {@link InputStream}. 060 */ 061 public UnicodeReader(InputStream in) { 062 this(in instanceof CharsetDetectInputStream 063 ? ((CharsetDetectInputStream) in) 064 : new CharsetDetectInputStream(in, DEFAULT)); 065 } 066 067 private UnicodeReader(CharsetDetectInputStream in) { 068 super(new InputStreamReader(in, in.getCharset())); 069 } 070 071 @Override 072 public String toString() { return super.toString(); } 073 074 @ToString 075 private static class CharsetDetectInputStream extends PushbackInputStream { 076 private final Charset charset; 077 078 public CharsetDetectInputStream(InputStream in, Charset charset) { 079 super(in, 8); 080 081 try { 082 for (Map.Entry<byte[],Charset> entry : BOMCharsetMap.INSTANCE.entrySet()) { 083 byte[] bytes = new byte[entry.getKey().length]; 084 int length = read(bytes); 085 086 if (length < 0) { 087 break; 088 } 089 090 if (bytes.length == length && Arrays.equals(bytes, entry.getKey())) { 091 charset = entry.getValue(); 092 break; 093 } else { 094 if (length > 0) { 095 unread(bytes, 0, length); 096 } 097 } 098 } 099 100 this.charset = Objects.requireNonNull(charset); 101 } catch (Exception exception) { 102 throw new ExceptionInInitializerError(exception); 103 } 104 } 105 106 public Charset getCharset() { return charset; } 107 } 108}