java讀取txt文件解決亂碼問題

說明：由於 txt 文件可能帶有 BOM，且編碼方式各不相同（GBK、UTF-8、UTF-16 等），導入數據時容易產生亂碼；以下代碼先檢測編碼、再去除 BOM，用於解決該亂碼問題。
參考他人代碼，結合自己的業務加工完成，費了大半天功夫完成，希望對大家有點用處。
廢話不多說，直接上代碼：
 /**
     * Reads the non-blank lines of a text stream after detecting its charset and
     * removing a leading byte-order mark.
     *
     * <p>The input is handed to {@code getStreamCharset}, which returns a replacement
     * stream plus the detected charset; only the replacement stream is read here.
     * When {@code BOMInputStream} reports a BOM, the charset named by that BOM is used
     * for decoding; otherwise the detected charset is used and a UTF-16 BOM that
     * survived decoding of the first line is stripped manually.
     *
     * @param txtStream stream over the raw bytes of the text file
     * @return the lines that are non-empty after {@code trim()}, in file order; the
     *         lines themselves are returned untrimmed
     * @throws IOException if charset detection or reading fails
     */
    public static List<String> readFromTxt(InputStream txtStream) throws IOException {
        List<String> paragraphList = new ArrayList<>();
        LabelValuePair<InputStream, Charset> result = getStreamCharset(txtStream);
        Charset cs = result.getValue();
        // try-with-resources: the streams are released even if BOM detection,
        // decoding or readLine() throws part-way through.
        try (BOMInputStream bomInputStream = new BOMInputStream(result.getLabel())) {
            boolean hasBom = bomInputStream.hasBOM();
            Charset readCharset = hasBom
                    ? Charset.forName(bomInputStream.getBOMCharsetName())
                    : cs;
            try (BufferedReader br = new BufferedReader(new InputStreamReader(bomInputStream, readCharset))) {
                boolean firstLine = true;
                String line;
                while ((line = br.readLine()) != null) {
                    if (firstLine) {
                        firstLine = false;
                        if (!hasBom) {
                            // BOMInputStream did not consume a BOM, so a UTF-16 BOM may
                            // still be sitting at the start of the first decoded line.
                            line = stripLeadingUtf16Bom(line, cs);
                        }
                    }
                    // line is never null here; an empty or whitespace-only line trims to "".
                    if (StringUtils.isNotEmpty(line.trim())) {
                        paragraphList.add(line);
                        log.info("讀取數據：{}，長度：{}，value：{}", line, line.trim().length(), line.getBytes(cs));
                    }
                }
            }
        }
        return paragraphList;
    }

    /**
     * Removes a UTF-16 byte-order mark (FF FE or FE FF) from the start of a line.
     *
     * @param line decoded first line of the file
     * @param cs   charset used to re-encode the line for the byte-level check
     * @return the line without the two BOM bytes, or the line unchanged when its
     *         encoded form is shorter than two bytes or does not start with a BOM
     */
    private static String stripLeadingUtf16Bom(String line, Charset cs) {
        byte[] bts = line.getBytes(cs);
        if (bts.length < 2) {
            // Too short to hold a BOM; also avoids indexing past the end of the array.
            return line;
        }
        boolean littleEndianBom = bts[0] == (byte) 0xFF && bts[1] == (byte) 0xFE;
        boolean bigEndianBom = bts[0] == (byte) 0xFE && bts[1] == (byte) 0xFF;
        if (littleEndianBom || bigEndianBom) {
            return new String(bts, 2, bts.length - 2, cs);
        }
        return line;
    }

 /**
     * Detects the charset of a stream's content, mainly for reading txt files.
     *
     * <p>The stream is passed to {@code readSteam(stream, true)}; callers must continue
     * reading from the stream contained in the returned pair rather than from
     * {@code stream} itself.
     *
     * @param stream stream over the raw bytes to inspect
     * @return a pair holding the stream to read from next and the detected charset;
     *         content shorter than two bytes is reported as {@code CharsetKit.CHARSET_GBK}
     * @throws IOException if buffering the stream fails
     */
    public static LabelValuePair<InputStream, Charset> getStreamCharset(InputStream stream) throws IOException {
        LabelValuePair<InputStream, byte[]> buffered = readSteam(stream, true);
        byte[] content = buffered.getValue();

        Charset detected;
        if (content.length >= 2) {
            // Probe a separate in-memory view so the stream handed back to the caller stays unread.
            BufferedInputStream probe = new BufferedInputStream(new ByteArrayInputStream(content));
            String encode = getFileCharSet(probe);
            detected = CharsetKit.charset(encode);
        } else {
            // Too little data to sniff; use the GBK default.
            detected = CharsetKit.CHARSET_GBK;
        }

        return new LabelValuePair<>(buffered.getLabel(), detected);
    }

  /**
     * Guesses the charset name of txt content from its leading bytes.
     *
     * <p>Detection order: a UTF-16LE BOM (FF FE), a UTF-16BE BOM (FE FF) or a UTF-8 BOM
     * (EF BB BF) decides immediately. Without a BOM the bytes are scanned from the
     * start and the first well-formed three-byte UTF-8 sequence yields "UTF-8"; any
     * byte pattern that does not fit stops the scan and leaves the default "GBK".
     *
     * <p>The stream is closed before this method returns. Any exception is logged and
     * the charset determined so far is returned.
     *
     * <p>NOTE(review): content whose only non-ASCII characters are two-byte or
     * four-byte UTF-8 sequences is reported as "GBK" by this heuristic.
     *
     * @param bis buffered stream positioned at the start of the content
     * @return "UTF-16LE", "UTF-16BE", "UTF-8" or the default "GBK"
     */
    public static String getFileCharSet(BufferedInputStream bis) {
        String charset = "GBK";
        byte[] first3Bytes = new byte[3];
        // try-with-resources closes the stream on every path, including the early return.
        try (BufferedInputStream in = bis) {
            boolean checked = false;
            // The read limit must cover the bytes consumed before reset().
            in.mark(first3Bytes.length);
            int read = in.read(first3Bytes, 0, 3);
            if (read == -1) {
                return charset; // empty content: keep the ANSI (GBK) default
            } else if (first3Bytes[0] == (byte) 0xFF
                    && first3Bytes[1] == (byte) 0xFE) {
                charset = "UTF-16LE"; // "Unicode" (little endian) BOM
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xFE
                    && first3Bytes[1] == (byte) 0xFF) {
                charset = "UTF-16BE"; // "Unicode big endian" BOM
                checked = true;
            } else if (first3Bytes[0] == (byte) 0xEF
                    && first3Bytes[1] == (byte) 0xBB
                    && first3Bytes[2] == (byte) 0xBF) {
                charset = "UTF-8"; // UTF-8 BOM
                checked = true;
            }
            in.reset();
            if (!checked) {
                while ((read = in.read()) != -1) {
                    if (read >= 0xF0) {
                        break;
                    }
                    if (isUtf8Continuation(read)) {
                        // A stray continuation byte is treated as GBK.
                        break;
                    }
                    if (0xC0 <= read && read <= 0xDF) {
                        // Two-byte form (0xC0-0xDF)(0x80-0xBF) may equally be GBK: keep scanning.
                        if (!isUtf8Continuation(in.read())) {
                            break;
                        }
                    } else if (0xE0 <= read && read <= 0xEF) {
                        // Three-byte form; a false positive is possible but unlikely.
                        if (isUtf8Continuation(in.read()) && isUtf8Continuation(in.read())) {
                            charset = "UTF-8";
                        }
                        break;
                    }
                }
            }
        } catch (Exception e) {
            log.error("獲取文件編碼方式異常", e);
        }
        return charset;
    }

    /**
     * Tells whether a value returned by {@code read()} lies in 0x80-0xBF, the range of
     * a UTF-8 continuation byte. End of stream (-1) is never in range.
     */
    private static boolean isUtf8Continuation(int b) {
        return 0x80 <= b && b <= 0xBF;
    }

    /**
     * Reads a stream fully into memory and closes it.
     *
     * <p>Only the bytes actually returned by each {@code read} call are copied, so the
     * resulting array has exactly the length of the content.
     *
     * @param inputStream stream to drain; it is closed before this method returns
     * @param isRepeat    whether the content must be readable again; when {@code true}
     *                    the returned pair carries a new in-memory stream over the bytes,
     *                    otherwise its stream is {@code null}
     * @return a pair of the re-readable stream (or {@code null}) and the bytes read
     * @throws IOException if reading from {@code inputStream} fails
     */
    public static LabelValuePair<InputStream, byte[]> readSteam(InputStream inputStream, boolean isRepeat) throws IOException {
        byte[] fs;
        // try-with-resources closes both streams even when read() throws.
        try (InputStream in = inputStream;
             ByteArrayOutputStream outSteam = new ByteArrayOutputStream()) {
            byte[] buffer = new byte[1024];
            int len;
            while ((len = in.read(buffer)) != -1) {
                // Write only the len bytes just read; writing the whole buffer would
                // append stale bytes after a short read.
                outSteam.write(buffer, 0, len);
            }
            fs = outSteam.toByteArray();
        }
        InputStream newSteam = isRepeat ? new ByteArrayInputStream(fs) : null;

        return new LabelValuePair<>(newSteam, fs);
    }
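更新：readSteam 複製流時只能寫入每次實際讀到的 len 個字節，即 outSteam.write(buffer, 0, len)；若寫成 outSteam.write(buffer)，每次都會把整個 1024 字節的緩衝區寫出，導致結果末尾混入多餘數據。也可以直接使用 byte[] fs = IOUtils.toByteArray(inputStream);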
posted @ 2023-07-30 18:16 newbigapple 閱讀(956) 評論(0) 收藏舉報
刷新頁面返回頂部
javacoffeenet

java讀取txt文件解決亂碼問題

公告