JAVA: HOW TO DEAL WITH THE BOM IN A UNICODE INPUTSTREAM

短命女 2022-05-27 11:46 196阅读 0赞

OK, so I was happily reading CSV files from an SFTP server. The file content is returned as an InputStream, and I used a BufferedReader to read it line by line. Each line contained either a header or an order. The header lines started with the string “HDR”.

However, I suddenly discovered that my code was consistently skipping the first header (and as a result the orders belonging to it). The reason, I found, was simple. The first header, on the first line, didn’t start with “HDR”, it started with “□HDR”! And that undisplayable square turned out to be a Unicode Byte Order Mark (BOM).

To deal with the BOM, we can use a simple class I found on Stack Overflow which handles it for us. Here it is:

  1. import java.io.IOException;
  2. import java.io.InputStream;
  3. import java.io.PushbackInputStream;
  /**
   * An {@link InputStream} wrapper that detects a Unicode BOM (Byte Order Mark) at the beginning
   * of the wrapped stream and can optionally skip it.
   *
   * <p>The <a href="http://www.unicode.org/unicode/faq/utf_bom.html">Unicode FAQ</a> defines 5
   * types of BOMs:
   *
   * <ul>
   *   <li>{@code 00 00 FE FF} = UTF-32, big-endian
   *   <li>{@code FF FE 00 00} = UTF-32, little-endian
   *   <li>{@code FE FF} = UTF-16, big-endian
   *   <li>{@code FF FE} = UTF-16, little-endian
   *   <li>{@code EF BB BF} = UTF-8
   * </ul>
   *
   * <p>See also <a href="http://www.faqs.org/rfcs/rfc3629.html">RFC 3629 - UTF-8, a
   * transformation format of ISO 10646</a>.
   *
   * <p>Detection happens in the constructor: up to four bytes are read from the wrapped stream,
   * inspected, and pushed back, so no data is consumed until {@link #skipBOM()} is called. Use
   * {@link #getBOM()} to find out whether a BOM has been detected, and {@link #skipBOM()} to
   * remove the detected BOM from the stream.
   *
   * @author Gregory Pakosz
   * @see <a href="http://stackoverflow.com/q/1835430/39321#1835529">Original Stack Overflow
   *     answer</a>
   */
  class UnicodeBOMInputStream extends InputStream {

    /** Length in bytes of the longest BOM (the UTF-32 ones); also the push-back buffer size. */
    private static final int MAX_BOM_LENGTH = 4;

    /**
     * Signatures in the order they must be tested. The UTF-32 little-endian BOM (FF FE 00 00)
     * begins with the UTF-16 little-endian BOM (FF FE), so the longer signatures come first.
     */
    private static final BOM[] DETECTION_ORDER = {
      BOM.UTF_32_LE, BOM.UTF_32_BE, BOM.UTF_8, BOM.UTF_16_LE, BOM.UTF_16_BE
    };

    private final PushbackInputStream in;
    private final BOM bom;
    private boolean skipped = false;

    /** Type safe enumeration class that describes the different types of Unicode BOMs. */
    public static final class BOM {

      /** NONE: no BOM was detected. */
      public static final BOM NONE = new BOM(new byte[] {}, "NONE");

      /** UTF-8 BOM (EF BB BF). */
      public static final BOM UTF_8 =
          new BOM(new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF}, "UTF-8");

      /** UTF-16, little-endian (FF FE). */
      public static final BOM UTF_16_LE =
          new BOM(new byte[] {(byte) 0xFF, (byte) 0xFE}, "UTF-16 little-endian");

      /** UTF-16, big-endian (FE FF). */
      public static final BOM UTF_16_BE =
          new BOM(new byte[] {(byte) 0xFE, (byte) 0xFF}, "UTF-16 big-endian");

      /** UTF-32, little-endian (FF FE 00 00). */
      public static final BOM UTF_32_LE =
          new BOM(
              new byte[] {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00},
              "UTF-32 little-endian");

      /** UTF-32, big-endian (00 00 FE FF). */
      public static final BOM UTF_32_BE =
          new BOM(
              new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF},
              "UTF-32 big-endian");

      // Kept private so the signature can only leave this class as a copy (see getBytes()).
      private final byte[] bytes;
      private final String description;

      private BOM(final byte[] bom, final String description) {
        this.bytes = bom;
        this.description = description;
      }

      /**
       * Returns a {@link String} representation of this {@link BOM} value.
       *
       * @return the human-readable description given at construction, e.g. {@code "UTF-8"}.
       */
      @Override
      public final String toString() {
        return description;
      }

      /**
       * Returns the bytes corresponding to this {@link BOM} value.
       *
       * @return a fresh copy of the signature bytes; empty for {@link #NONE}.
       */
      public final byte[] getBytes() {
        // Defensive copy: callers must not be able to alter the shared constants.
        return bytes.clone();
      }
    }

    /**
     * Constructs a new {@link UnicodeBOMInputStream} that wraps the specified
     * {@link InputStream}.
     *
     * <p>Up to four bytes are read to detect the BOM and are then pushed back. The constructor
     * keeps reading until four bytes are buffered or the end of the stream is reached, so it may
     * block on a stream that delivers its first bytes slowly.
     *
     * @param inputStream an {@link InputStream}.
     * @throws IOException on reading from the specified {@link InputStream} when trying to detect
     *     the Unicode BOM.
     */
    public UnicodeBOMInputStream(final InputStream inputStream) throws IOException {
      in = new PushbackInputStream(inputStream, MAX_BOM_LENGTH);

      final byte[] head = new byte[MAX_BOM_LENGTH];
      final int count = readHead(in, head);
      bom = detect(head, count);

      // Push everything back so that the stream content is untouched until skipBOM() is called.
      if (count > 0) {
        in.unread(head, 0, count);
      }
    }

    /**
     * Fills {@code buffer} from {@code source}, stopping early only at end of stream.
     *
     * <p>A single {@code read} call may return fewer bytes than requested even when more data
     * follows; relying on one call would make BOM detection fail on such streams.
     *
     * @return the number of bytes actually stored in {@code buffer} (0 for an empty stream).
     */
    private static int readHead(final InputStream source, final byte[] buffer)
        throws IOException {
      int total = 0;
      while (total < buffer.length) {
        final int n = source.read(buffer, total, buffer.length - total);
        if (n < 0) {
          break; // end of stream
        }
        total += n;
      }
      return total;
    }

    /** Returns the first signature in {@link #DETECTION_ORDER} that prefixes {@code head}. */
    private static BOM detect(final byte[] head, final int count) {
      for (final BOM candidate : DETECTION_ORDER) {
        if (startsWith(head, count, candidate.bytes)) {
          return candidate;
        }
      }
      return BOM.NONE;
    }

    /** Tells whether the first {@code count} valid bytes of {@code head} begin with {@code sig}. */
    private static boolean startsWith(final byte[] head, final int count, final byte[] sig) {
      if (count < sig.length) {
        return false;
      }
      for (int i = 0; i < sig.length; i++) {
        if (head[i] != sig[i]) {
          return false;
        }
      }
      return true;
    }

    /**
     * Returns the {@link BOM} that was detected in the wrapped {@link InputStream} object.
     *
     * @return a {@link BOM} value; {@link BOM#NONE} when no BOM was found.
     */
    public final BOM getBOM() {
      // BOM type is immutable.
      return bom;
    }

    /**
     * Skips the {@link BOM} that was found in the wrapped {@link InputStream} object. Calling
     * this method more than once has no further effect.
     *
     * <p>This is meant to be called before any byte has been read from this stream; it skips
     * as many bytes as the detected BOM is long from the current position.
     *
     * @return this {@link UnicodeBOMInputStream}.
     * @throws IOException when trying to skip the BOM from the wrapped {@link InputStream}
     *     object.
     */
    public final synchronized UnicodeBOMInputStream skipBOM() throws IOException {
      if (!skipped) {
        long remaining = bom.bytes.length;
        // skip() is allowed to skip fewer bytes than asked, so repeat until done.
        while (remaining > 0) {
          final long n = in.skip(remaining);
          if (n <= 0) {
            break;
          }
          remaining -= n;
        }
        skipped = true;
      }
      return this;
    }

    /** {@inheritDoc} */
    @Override
    public int read() throws IOException {
      return in.read();
    }

    /** {@inheritDoc} */
    @Override
    public int read(final byte[] b) throws IOException {
      return in.read(b, 0, b.length);
    }

    /** {@inheritDoc} */
    @Override
    public int read(final byte[] b, final int off, final int len) throws IOException {
      return in.read(b, off, len);
    }

    /** {@inheritDoc} */
    @Override
    public long skip(final long n) throws IOException {
      return in.skip(n);
    }

    /** {@inheritDoc} */
    @Override
    public int available() throws IOException {
      return in.available();
    }

    /** {@inheritDoc} */
    @Override
    public void close() throws IOException {
      in.close();
    }

    /** {@inheritDoc} */
    @Override
    public synchronized void mark(final int readlimit) {
      in.mark(readlimit);
    }

    /** {@inheritDoc} */
    @Override
    public synchronized void reset() throws IOException {
      in.reset();
    }

    /** {@inheritDoc} */
    @Override
    public boolean markSupported() {
      return in.markSupported();
    }
  }

Looks long, but it’s simple to use.

InputStream cleanStream = new UnicodeBOMInputStream(stream).skipBOM();

When reading the stream, remember to also use an InputStreamReader with the correct character set. For example something like this:

BufferedReader reader = new BufferedReader(new InputStreamReader(cleanStream, "UTF-8"));
String line = null;
while ((line = reader.readLine()) != null)
    System.out.println(line);
reader.close();

If the file might use any of the Unicode encodings, you can probably use the getBOM method of the UnicodeBOMInputStream to choose the right character set for the InputStreamReader.

To sum up, this is tricky and annoying stuff! If something in this post is wrong or inaccurate or should be done differently, please leave a comment. I want to get this right.

原文地址: https://www.geekality.net/2011/05/23/java-how-to-deal-with-the-bom-in-a-unicode-inputstream/

发表评论

表情:
评论列表 (有 0 条评论,196人围观)

还没有评论,来说两句吧...

相关阅读