001package ball.io;
002/*-
003 * ##########################################################################
004 * Utilities
005 * %%
006 * Copyright (C) 2008 - 2022 Allen D. Ball
007 * %%
008 * Licensed under the Apache License, Version 2.0 (the "License");
009 * you may not use this file except in compliance with the License.
010 * You may obtain a copy of the License at
011 *
012 *      http://www.apache.org/licenses/LICENSE-2.0
013 *
014 * Unless required by applicable law or agreed to in writing, software
015 * distributed under the License is distributed on an "AS IS" BASIS,
016 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
017 * See the License for the specific language governing permissions and
018 * limitations under the License.
019 * ##########################################################################
020 */
import java.io.File;
import java.io.FileInputStream;
import java.io.FileNotFoundException;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.io.LineNumberReader;
import java.io.PushbackInputStream;
import java.nio.charset.Charset;
import java.util.Arrays;
import java.util.Map;
import java.util.Objects;
import lombok.ToString;
033
034import static java.nio.charset.StandardCharsets.UTF_8;
035
036/**
037 * {@link java.io.BufferedReader} implementation which analyzes the
038 * underlying {@link InputStream} for byte order marks and selects the
039 * appropriate {@link Charset}.
040 *
041 * @see BOMCharsetMap
042 *
043 * @author {@link.uri mailto:ball@hcf.dev Allen D. Ball}
044 */
045public class UnicodeReader extends LineNumberReader {
046    private static final Charset DEFAULT = UTF_8;
047
048    /**
049     * @param   file            The {@link File} to open.
050     *
051     * @throws  FileNotFoundException
052     *                          If the {@link File} is not found.
053     */
054    public UnicodeReader(File file) throws FileNotFoundException {
055        this(new FileInputStream(file));
056    }
057
058    /**
059     * @param   in              The underlying {@link InputStream}.
060     */
061    public UnicodeReader(InputStream in) {
062        this(in instanceof CharsetDetectInputStream
063                 ? ((CharsetDetectInputStream) in)
064                 : new CharsetDetectInputStream(in, DEFAULT));
065    }
066
067    private UnicodeReader(CharsetDetectInputStream in) {
068        super(new InputStreamReader(in, in.getCharset()));
069    }
070
071    @Override
072    public String toString() { return super.toString(); }
073
074    @ToString
075    private static class CharsetDetectInputStream extends PushbackInputStream {
076        private final Charset charset;
077
078        public CharsetDetectInputStream(InputStream in, Charset charset) {
079            super(in, 8);
080
081            try {
082                for (Map.Entry<byte[],Charset> entry : BOMCharsetMap.INSTANCE.entrySet()) {
083                    byte[] bytes = new byte[entry.getKey().length];
084                    int length = read(bytes);
085
086                    if (length < 0) {
087                        break;
088                    }
089
090                    if (bytes.length == length && Arrays.equals(bytes, entry.getKey())) {
091                        charset = entry.getValue();
092                        break;
093                    } else {
094                        if (length > 0) {
095                            unread(bytes, 0, length);
096                        }
097                    }
098                }
099
100                this.charset = Objects.requireNonNull(charset);
101            } catch (Exception exception) {
102                throw new ExceptionInInitializerError(exception);
103            }
104        }
105
106        public Charset getCharset() { return charset; }
107    }
108}