java讀取txt文件解決亂碼問題
說明:由于txt文件有bom和不同的編碼方式,導致導入數據時產生亂碼,以下代碼完美解決亂碼問題。
參考他人代碼,結合自己的業務加工完成,費了大半天功夫完成,希望對大家有點用處。
廢話不多說,直接上代碼:
/**
* 從txt文件流讀取數據
*
* @param txtStream
* @return
* @throws IOException
*/
public static List<String> readFromTxt(InputStream txtStream) throws IOException {
List<String> paragraphList = new ArrayList<>();
LabelValuePair<InputStream, Charset> result = getStreamCharset(txtStream);
Charset cs = result.getValue();
BOMInputStream bomInputStream = new BOMInputStream(result.getLabel());
boolean hasBom = bomInputStream.hasBOM();
InputStreamReader sr = hasBom ?
new InputStreamReader(bomInputStream, Charset.forName(bomInputStream.getBOMCharsetName())) :
new InputStreamReader(bomInputStream, cs);
BufferedReader br = new BufferedReader(sr);
String line = null;
Integer lineIndex = 0;
while ((line = br.readLine()) != null) {
if (!hasBom && lineIndex == 0) {
lineIndex++;
if (StringUtils.isNotEmpty(line)) {
byte[] bts = line.getBytes(cs);
if ((bts[0] == -1 && bts[1] == -2) || bts[0] == -2 && bts[1] == -1) {
byte[] newBts = new byte[bts.length - 2];
for (int i = 2; i < bts.length; i++) {
newBts[i - 2] = bts[i];
}
line = new String(newBts, cs);
}
}
}
if (StringUtils.isNotEmpty(line) && StringUtils.isNotEmpty(line.trim())) {
paragraphList.add(line);
log.info("讀取數據:{},長度:{},value:{}", line, line.trim().length(), line.getBytes(cs));
}
}
br.close();
sr.close();
return paragraphList;
}
/**
* 判斷獲取字節流 編碼格式,主要用于txt文件內容讀取
* 再次讀取流,使用返回結果中的流
*
* @param stream
* @return
*/
public static LabelValuePair<InputStream, Charset> getStreamCharset(InputStream stream) throws IOException {
LabelValuePair<InputStream, byte[]> result = readSteam(stream, true);
byte[] buffer = result.getValue();
if (buffer.length < 2)
return new LabelValuePair<>(result.getLabel(), CharsetKit.CHARSET_GBK);
String encode = getFileCharSet(new BufferedInputStream(new ByteArrayInputStream(result.getValue())));// getBytesCharset(buffer);
return new LabelValuePair<>(result.getLabel(), CharsetKit.charset(encode));
}
/**
* 判斷txt編碼格式方法
*
* @param bis
* @return
*/
public static String getFileCharSet(BufferedInputStream bis) {
String charset = "GBK";
byte[] first3Bytes = new byte[3];
try {
boolean checked = false;
bis.mark(0);
int read = bis.read(first3Bytes, 0, 3);
if (read == -1) {
return charset; //文件編碼為 ANSI
} else if (first3Bytes[0] == (byte) 0xFF
&& first3Bytes[1] == (byte) 0xFE) {
charset = "UTF-16LE"; //文件編碼為 Unicode
checked = true;
} else if (first3Bytes[0] == (byte) 0xFE
&& first3Bytes[1] == (byte) 0xFF) {
charset = "UTF-16BE"; //文件編碼為 Unicode big endian
checked = true;
} else if (first3Bytes[0] == (byte) 0xEF
&& first3Bytes[1] == (byte) 0xBB
&& first3Bytes[2] == (byte) 0xBF) {
charset = "UTF-8"; //文件編碼為 UTF-8
checked = true;
}
bis.reset();
if (!checked) {
int loc = 0;
while ((read = bis.read()) != -1) {
loc++;
if (read >= 0xF0)
break;
if (0x80 <= read && read <= 0xBF) // 單獨出現BF以下的,也算是GBK
break;
if (0xC0 <= read && read <= 0xDF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) // 雙字節 (0xC0 - 0xDF)
// (0x80
// - 0xBF),也可能在GB編碼內
continue;
else
break;
} else if (0xE0 <= read && read <= 0xEF) {// 也有可能出錯,但是幾率較小
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
read = bis.read();
if (0x80 <= read && read <= 0xBF) {
charset = "UTF-8";
break;
} else
break;
} else
break;
}
}
}
bis.close();
} catch (Exception e) {
log.error("獲取文件編碼方式異常", e);
}
return charset;
}
/**
* 讀取流
*
* @param inputStream 輸入流
* @param isRepeat 是否重復讀取
* @return
*/
public static LabelValuePair<InputStream, byte[]> readSteam(InputStream inputStream, boolean isRepeat) throws IOException {
ByteArrayOutputStream outSteam = new ByteArrayOutputStream();
byte[] buffer = new byte[1024];
int len = -1;
inputStream.mark(0);
while ((len = inputStream.read(buffer)) != -1) {
outSteam.write(buffer);
}
byte[] fs = outSteam.toByteArray();
outSteam.close();
inputStream.close();
InputStream newSteam = null;
if (isRepeat) {
newSteam = new ByteArrayInputStream(fs);
}
return new LabelValuePair<>(newSteam, fs);
}
更新:以上代碼對流的復制有問題,請使用 byte[] fs = IOUtils.toByteArray(inputStream);

浙公網安備 33010602011771號