001/* 002 * Licensed to the Apache Software Foundation (ASF) under one or more 003 * contributor license agreements. See the NOTICE file distributed with 004 * this work for additional information regarding copyright ownership. 005 * The ASF licenses this file to You under the Apache License, Version 2.0 006 * (the "License"); you may not use this file except in compliance with 007 * the License. You may obtain a copy of the License at 008 * 009 * http://www.apache.org/licenses/LICENSE-2.0 010 * 011 * Unless required by applicable law or agreed to in writing, software 012 * distributed under the License is distributed on an "AS IS" BASIS, 013 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 014 * See the License for the specific language governing permissions and 015 * limitations under the License. 016 */ 017package org.apache.commons.io.input; 018 019import java.io.BufferedInputStream; 020import java.io.BufferedReader; 021import java.io.File; 022import java.io.IOException; 023import java.io.InputStream; 024import java.io.InputStreamReader; 025import java.io.Reader; 026import java.io.StringReader; 027import java.net.HttpURLConnection; 028import java.net.URL; 029import java.net.URLConnection; 030import java.nio.charset.Charset; 031import java.nio.charset.StandardCharsets; 032import java.nio.file.Files; 033import java.nio.file.Path; 034import java.text.MessageFormat; 035import java.util.Locale; 036import java.util.Objects; 037import java.util.regex.Matcher; 038import java.util.regex.Pattern; 039 040import org.apache.commons.io.ByteOrderMark; 041import org.apache.commons.io.Charsets; 042import org.apache.commons.io.IOUtils; 043import org.apache.commons.io.build.AbstractStreamBuilder; 044import org.apache.commons.io.function.IOConsumer; 045import org.apache.commons.io.output.XmlStreamWriter; 046 047/** 048 * Character stream that handles all the necessary Voodoo to figure out the charset encoding of the XML document within the stream. 
 * <p>
 * IMPORTANT: This class is not related in any way to the org.xml.sax.XMLReader. This one IS a character stream.
 * </p>
 * <p>
 * All this has to be done without consuming characters from the stream, if not the XML parser will not recognize the document as a valid XML. This is not 100%
 * true, but it's close enough (UTF-8 BOM is not handled by all parsers right now, XmlStreamReader handles it and things work in all parsers).
 * </p>
 * <p>
 * The XmlStreamReader class handles the charset encoding of XML documents in Files, raw streams and HTTP streams by offering a wide set of constructors.
 * </p>
 * <p>
 * By default the charset encoding detection is lenient, the constructor with the lenient flag can be used for a strict detection (following HTTP MIME and XML
 * specifications). All this is nicely explained by Mark Pilgrim in his blog, <a href="http://diveintomark.org/archives/2004/02/13/xml-media-types"> Determining
 * the character encoding of a feed</a>.
 * </p>
 * <p>
 * To build an instance, see {@link Builder}.
 * </p>
 * <p>
 * Originally developed for <a href="http://rome.dev.java.net">ROME</a> under Apache License 2.0.
 * </p>
 *
 * @see org.apache.commons.io.output.XmlStreamWriter
 * @since 2.0
 */
public class XmlStreamReader extends Reader {

    /**
     * Builds a new {@link XmlStreamReader} instance.
     * <p>
     * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
     * </p>
     * <p>
     * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
     * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
083 * </p> 084 * <p> 085 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 086 * </p> 087 * <p> 088 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 089 * </p> 090 * <p> 091 * Else if the XML prolog had a charset encoding that encoding is used. 092 * </p> 093 * <p> 094 * Else if the content type had a charset encoding that encoding is used. 095 * </p> 096 * <p> 097 * Else 'UTF-8' is used. 098 * </p> 099 * <p> 100 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 101 * </p> 102 * <p> 103 * For example: 104 * </p> 105 * 106 * <pre>{@code 107 * XmlStreamReader r = XmlStreamReader.builder().setPath(path).setCharset(StandardCharsets.UTF_8).get(); 108 * } 109 * </pre> 110 * 111 * @since 2.12.0 112 */ 113 public static class Builder extends AbstractStreamBuilder<XmlStreamReader, Builder> { 114 115 private boolean nullCharset = true; 116 private boolean lenient = true; 117 private String httpContentType; 118 119 /** 120 * Constructs a new instance. 121 * <p> 122 * This builder use the aspect InputStream, OpenOption[], httpContentType, lenient, and defaultEncoding. 123 * </p> 124 * <p> 125 * You must provide an origin that can be converted to an InputStream by this builder, otherwise, this call will throw an 126 * {@link UnsupportedOperationException}. 127 * </p> 128 * 129 * @return a new instance. 130 * @throws UnsupportedOperationException if the origin cannot provide an InputStream. 131 * @throws IOException thrown if there is a problem reading the stream. 132 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 133 * @see #getInputStream() 134 */ 135 @SuppressWarnings("resource") 136 @Override 137 public XmlStreamReader get() throws IOException { 138 final String defaultEncoding = nullCharset ? 
null : getCharset().name(); 139 // @formatter:off 140 return httpContentType == null 141 ? new XmlStreamReader(getInputStream(), lenient, defaultEncoding) 142 : new XmlStreamReader(getInputStream(), httpContentType, lenient, defaultEncoding); 143 // @formatter:on 144 } 145 146 @Override 147 public Builder setCharset(final Charset charset) { 148 nullCharset = charset == null; 149 return super.setCharset(charset); 150 } 151 152 @Override 153 public Builder setCharset(final String charset) { 154 nullCharset = charset == null; 155 return super.setCharset(Charsets.toCharset(charset, getCharsetDefault())); 156 } 157 158 /** 159 * Sets the HTTP content type. 160 * 161 * @param httpContentType the HTTP content type. 162 * @return this. 163 */ 164 public Builder setHttpContentType(final String httpContentType) { 165 this.httpContentType = httpContentType; 166 return this; 167 } 168 169 /** 170 * Sets the lenient toggle. 171 * 172 * @param lenient the lenient toggle. 173 * @return this. 174 */ 175 public Builder setLenient(final boolean lenient) { 176 this.lenient = lenient; 177 return this; 178 } 179 180 } 181 182 private static final String UTF_8 = StandardCharsets.UTF_8.name(); 183 184 private static final String US_ASCII = StandardCharsets.US_ASCII.name(); 185 186 private static final String UTF_16BE = StandardCharsets.UTF_16BE.name(); 187 188 private static final String UTF_16LE = StandardCharsets.UTF_16LE.name(); 189 190 private static final String UTF_32BE = "UTF-32BE"; 191 192 private static final String UTF_32LE = "UTF-32LE"; 193 194 private static final String UTF_16 = StandardCharsets.UTF_16.name(); 195 196 private static final String UTF_32 = "UTF-32"; 197 198 private static final String EBCDIC = "CP1047"; 199 200 private static final ByteOrderMark[] BOMS = { ByteOrderMark.UTF_8, ByteOrderMark.UTF_16BE, ByteOrderMark.UTF_16LE, ByteOrderMark.UTF_32BE, 201 ByteOrderMark.UTF_32LE }; 202 203 /** UTF_16LE and UTF_32LE have the same two starting BOM bytes. 
*/ 204 private static final ByteOrderMark[] XML_GUESS_BYTES = { new ByteOrderMark(UTF_8, 0x3C, 0x3F, 0x78, 0x6D), 205 new ByteOrderMark(UTF_16BE, 0x00, 0x3C, 0x00, 0x3F), new ByteOrderMark(UTF_16LE, 0x3C, 0x00, 0x3F, 0x00), 206 new ByteOrderMark(UTF_32BE, 0x00, 0x00, 0x00, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D), 207 new ByteOrderMark(UTF_32LE, 0x3C, 0x00, 0x00, 0x00, 0x3F, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x6D, 0x00, 0x00, 0x00), 208 new ByteOrderMark(EBCDIC, 0x4C, 0x6F, 0xA7, 0x94) }; 209 210 private static final Pattern CHARSET_PATTERN = Pattern.compile("charset=[\"']?([.[^; \"']]*)[\"']?"); 211 212 /** 213 * Pattern capturing the encoding of the "xml" processing instruction. 214 * <p> 215 * See also the <a href="https://www.w3.org/TR/2008/REC-xml-20081126/#NT-EncName">XML specification</a>. 216 * </p> 217 */ 218 public static final Pattern ENCODING_PATTERN = Pattern.compile( 219 // @formatter:off 220 "^<\\?xml\\s+" 221 + "version\\s*=\\s*(?:(?:\"1\\.[0-9]+\")|(?:'1.[0-9]+'))\\s+" 222 + "encoding\\s*=\\s*" 223 + "((?:\"[A-Za-z0-9][A-Za-z0-9._+:-]*\")" // double-quoted 224 + "|(?:'[A-Za-z0-9][A-Za-z0-9._+:-]*'))", // single-quoted 225 Pattern.MULTILINE); 226 // N.B. the documented pattern is 227 // EncName ::= [A-Za-z] ([A-Za-z0-9._] | '-')* 228 // However this does not match all the aliases that are supported by Java. 229 // e.g. 
'437', 'ISO_8859-1:1987' and 'ebcdic-de-273+euro' 230 // @formatter:on 231 232 private static final String RAW_EX_1 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] encoding mismatch"; 233 234 private static final String RAW_EX_2 = "Illegal encoding, BOM [{0}] XML guess [{1}] XML prolog [{2}] unknown BOM"; 235 236 private static final String HTTP_EX_1 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], BOM must be null"; 237 238 private static final String HTTP_EX_2 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], encoding mismatch"; 239 240 private static final String HTTP_EX_3 = "Illegal encoding, CT-MIME [{0}] CT-Enc [{1}] BOM [{2}] XML guess [{3}] XML prolog [{4}], Illegal MIME"; 241 242 /** 243 * Constructs a new {@link Builder}. 244 * 245 * @return a new {@link Builder}. 246 * @since 2.12.0 247 */ 248 public static Builder builder() { 249 return new Builder(); 250 } 251 252 /** 253 * Gets the charset parameter value, {@code null} if not present, {@code null} if httpContentType is {@code null}. 254 * 255 * @param httpContentType the HTTP content type 256 * @return The content type encoding (upcased) 257 */ 258 static String getContentTypeEncoding(final String httpContentType) { 259 String encoding = null; 260 if (httpContentType != null) { 261 final int i = httpContentType.indexOf(";"); 262 if (i > -1) { 263 final String postMime = httpContentType.substring(i + 1); 264 final Matcher m = CHARSET_PATTERN.matcher(postMime); 265 encoding = m.find() ? m.group(1) : null; 266 encoding = encoding != null ? encoding.toUpperCase(Locale.ROOT) : null; 267 } 268 } 269 return encoding; 270 } 271 272 /** 273 * Gets the MIME type or {@code null} if httpContentType is {@code null}. 
274 * 275 * @param httpContentType the HTTP content type 276 * @return The mime content type 277 */ 278 static String getContentTypeMime(final String httpContentType) { 279 String mime = null; 280 if (httpContentType != null) { 281 final int i = httpContentType.indexOf(";"); 282 if (i >= 0) { 283 mime = httpContentType.substring(0, i); 284 } else { 285 mime = httpContentType; 286 } 287 mime = mime.trim(); 288 } 289 return mime; 290 } 291 292 /** 293 * Gets the encoding declared in the <?xml encoding=...?>, {@code null} if none. 294 * 295 * @param inputStream InputStream to create the reader from. 296 * @param guessedEnc guessed encoding 297 * @return the encoding declared in the <?xml encoding=...?> 298 * @throws IOException thrown if there is a problem reading the stream. 299 */ 300 private static String getXmlProlog(final InputStream inputStream, final String guessedEnc) throws IOException { 301 String encoding = null; 302 if (guessedEnc != null) { 303 final byte[] bytes = IOUtils.byteArray(); 304 inputStream.mark(IOUtils.DEFAULT_BUFFER_SIZE); 305 int offset = 0; 306 int max = IOUtils.DEFAULT_BUFFER_SIZE; 307 int c = inputStream.read(bytes, offset, max); 308 int firstGT = -1; 309 String xmlProlog = ""; // avoid possible NPE warning (cannot happen; this just silences the warning) 310 while (c != -1 && firstGT == -1 && offset < IOUtils.DEFAULT_BUFFER_SIZE) { 311 offset += c; 312 max -= c; 313 c = inputStream.read(bytes, offset, max); 314 xmlProlog = new String(bytes, 0, offset, guessedEnc); 315 firstGT = xmlProlog.indexOf('>'); 316 } 317 if (firstGT == -1) { 318 if (c == -1) { 319 throw new IOException("Unexpected end of XML stream"); 320 } 321 throw new IOException("XML prolog or ROOT element not found on first " + offset + " bytes"); 322 } 323 final int bytesRead = offset; 324 if (bytesRead > 0) { 325 inputStream.reset(); 326 final BufferedReader bReader = new BufferedReader(new StringReader(xmlProlog.substring(0, firstGT + 1))); 327 final StringBuilder prolog = 
new StringBuilder(); 328 IOConsumer.forEach(bReader.lines(), prolog::append); 329 final Matcher m = ENCODING_PATTERN.matcher(prolog); 330 if (m.find()) { 331 encoding = m.group(1).toUpperCase(Locale.ROOT); 332 encoding = encoding.substring(1, encoding.length() - 1); 333 } 334 } 335 } 336 return encoding; 337 } 338 339 /** 340 * Tests if the MIME type belongs to the APPLICATION XML family. 341 * 342 * @param mime The mime type 343 * @return true if the mime type belongs to the APPLICATION XML family, otherwise false 344 */ 345 static boolean isAppXml(final String mime) { 346 return mime != null && (mime.equals("application/xml") || mime.equals("application/xml-dtd") || mime.equals("application/xml-external-parsed-entity") 347 || mime.startsWith("application/") && mime.endsWith("+xml")); 348 } 349 350 /** 351 * Tests if the MIME type belongs to the TEXT XML family. 352 * 353 * @param mime The mime type 354 * @return true if the mime type belongs to the TEXT XML family, otherwise false 355 */ 356 static boolean isTextXml(final String mime) { 357 return mime != null && (mime.equals("text/xml") || mime.equals("text/xml-external-parsed-entity") || mime.startsWith("text/") && mime.endsWith("+xml")); 358 } 359 360 private final Reader reader; 361 362 private final String encoding; 363 364 private final String defaultEncoding; 365 366 /** 367 * Constructs a Reader for a File. 368 * <p> 369 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 370 * </p> 371 * <p> 372 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 373 * </p> 374 * 375 * @param file File to create a Reader from. 376 * @throws NullPointerException if the input is {@code null}. 377 * @throws IOException thrown if there is a problem reading the file. 
/**
 * Constructs a Reader for a File.
 * <p>
 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8.
 * </p>
 * <p>
 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
 * </p>
 *
 * @param file File to create a Reader from.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the file.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final File file) throws IOException {
    // Delegates to the Path constructor.
    this(Objects.requireNonNull(file, "file").toPath());
}

/**
 * Constructs a Reader for a raw InputStream.
 * <p>
 * It follows the same logic used for files.
 * </p>
 * <p>
 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
 * </p>
 *
 * @param inputStream InputStream to create a Reader from.
 * @throws NullPointerException if the input stream is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream) throws IOException {
    // Lenient detection.
    this(inputStream, true);
}
432 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 433 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 434 */ 435 @Deprecated 436 public XmlStreamReader(final InputStream inputStream, final boolean lenient) throws IOException { 437 this(inputStream, lenient, null); 438 } 439 440 /** 441 * Constructs a Reader for a raw InputStream. 442 * <p> 443 * It follows the same logic used for files. 444 * </p> 445 * <p> 446 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 447 * </p> 448 * <p> 449 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 450 * </p> 451 * <p> 452 * Else if the XML prolog had a charset encoding that encoding is used. 453 * </p> 454 * <p> 455 * Else if the content type had a charset encoding that encoding is used. 456 * </p> 457 * <p> 458 * Else 'UTF-8' is used. 459 * </p> 460 * <p> 461 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 462 * </p> 463 * 464 * @param inputStream InputStream to create a Reader from. 465 * @param lenient indicates if the charset encoding detection should be relaxed. 466 * @param defaultEncoding The default encoding 467 * @throws NullPointerException if the input stream is {@code null}. 468 * @throws IOException thrown if there is a problem reading the stream. 469 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 470 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 471 */ 472 @Deprecated 473 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 
474 public XmlStreamReader(final InputStream inputStream, final boolean lenient, final String defaultEncoding) throws IOException { 475 this.defaultEncoding = defaultEncoding; 476 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE), 477 false, BOMS); 478 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 479 this.encoding = processHttpStream(bom, pis, lenient); 480 this.reader = new InputStreamReader(pis, encoding); 481 } 482 483 /** 484 * Constructs a Reader using an InputStream and the associated content-type header. 485 * <p> 486 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog 487 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 488 * </p> 489 * <p> 490 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 491 * </p> 492 * 493 * @param inputStream InputStream to create the reader from. 494 * @param httpContentType content-type header to use for the resolution of the charset encoding. 495 * @throws NullPointerException if the input stream is {@code null}. 496 * @throws IOException thrown if there is a problem reading the file. 497 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 498 */ 499 @Deprecated 500 public XmlStreamReader(final InputStream inputStream, final String httpContentType) throws IOException { 501 this(inputStream, httpContentType, true); 502 } 503 504 /** 505 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection. 506 * <p> 507 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. 
/**
 * Constructs a Reader using an InputStream and the associated content-type header. This constructor is lenient regarding the encoding detection.
 * <p>
 * First it checks if the stream has BOM. If there is not BOM checks the content-type encoding. If there is not content-type encoding checks the XML prolog
 * encoding. If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type.
 * </p>
 * <p>
 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following:
 * </p>
 * <ul>
 * <li>If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again.</li>
 * <li>Else if the XML prolog had a charset encoding that encoding is used.</li>
 * <li>Else if the content type had a charset encoding that encoding is used.</li>
 * <li>Else 'UTF-8' is used.</li>
 * </ul>
 * <p>
 * If lenient detection is indicated an XmlStreamReaderException is never thrown.
 * </p>
 *
 * @param inputStream     InputStream to create the reader from.
 * @param httpContentType content-type header to use for the resolution of the charset encoding.
 * @param lenient         indicates if the charset encoding detection should be relaxed.
 * @throws NullPointerException     if the input stream is {@code null}.
 * @throws IOException              thrown if there is a problem reading the file.
 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification.
 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()}
 */
@Deprecated
public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient) throws IOException {
    // No default encoding.
    this(inputStream, httpContentType, lenient, null);
}
If there is not XML prolog encoding uses the default encoding mandated by the content-type MIME type. 547 * </p> 548 * <p> 549 * If lenient detection is indicated and the detection above fails as per specifications it then attempts the following: 550 * </p> 551 * <p> 552 * If the content type was 'text/html' it replaces it with 'text/xml' and tries the detection again. 553 * </p> 554 * <p> 555 * Else if the XML prolog had a charset encoding that encoding is used. 556 * </p> 557 * <p> 558 * Else if the content type had a charset encoding that encoding is used. 559 * </p> 560 * <p> 561 * Else 'UTF-8' is used. 562 * </p> 563 * <p> 564 * If lenient detection is indicated an XmlStreamReaderException is never thrown. 565 * </p> 566 * 567 * @param inputStream InputStream to create the reader from. 568 * @param httpContentType content-type header to use for the resolution of the charset encoding. 569 * @param lenient indicates if the charset encoding detection should be relaxed. 570 * @param defaultEncoding The default encoding 571 * @throws NullPointerException if the input stream is {@code null}. 572 * @throws IOException thrown if there is a problem reading the file. 573 * @throws XmlStreamReaderException thrown if the charset encoding could not be determined according to the specification. 574 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 575 */ 576 @Deprecated 577 @SuppressWarnings("resource") // InputStream is managed through a InputStreamReader in this instance. 
578 public XmlStreamReader(final InputStream inputStream, final String httpContentType, final boolean lenient, final String defaultEncoding) 579 throws IOException { 580 this.defaultEncoding = defaultEncoding; 581 final BOMInputStream bom = new BOMInputStream(new BufferedInputStream(Objects.requireNonNull(inputStream, "inputStream"), IOUtils.DEFAULT_BUFFER_SIZE), 582 false, BOMS); 583 final BOMInputStream pis = new BOMInputStream(bom, true, XML_GUESS_BYTES); 584 this.encoding = processHttpStream(bom, pis, lenient, httpContentType); 585 this.reader = new InputStreamReader(pis, encoding); 586 } 587 588 /** 589 * Constructs a Reader for a File. 590 * <p> 591 * It looks for the UTF-8 BOM first, if none sniffs the XML prolog charset, if this is also missing defaults to UTF-8. 592 * </p> 593 * <p> 594 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details. 595 * </p> 596 * 597 * @param file File to create a Reader from. 598 * @throws NullPointerException if the input is {@code null}. 599 * @throws IOException thrown if there is a problem reading the file. 600 * @since 2.11.0 601 * @deprecated Use {@link #builder()}, {@link Builder}, and {@link Builder#get()} 602 */ 603 @Deprecated 604 @SuppressWarnings("resource") // InputStream is managed through another reader in this instance. 605 public XmlStreamReader(final Path file) throws IOException { 606 this(Files.newInputStream(Objects.requireNonNull(file, "file"))); 607 } 608 609 /** 610 * Constructs a Reader using the InputStream of a URL. 611 * <p> 612 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files. 613 * </p> 614 * <p> 615 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type. 
/**
 * Constructs a Reader using the InputStream of a URL.
 * <p>
 * If the URL is not of type HTTP and there is not 'content-type' header in the fetched data it uses the same logic used for Files.
 * </p>
 * <p>
 * If the URL is a HTTP Url or there is a 'content-type' header in the fetched data it uses the same logic used for an InputStream with content-type.
 * </p>
 * <p>
 * It does a lenient charset encoding detection, check the constructor with the lenient parameter for details.
 * </p>
 *
 * @param url URL to create a Reader from.
 * @throws NullPointerException if the input is {@code null}.
 * @throws IOException          thrown if there is a problem reading the stream of the URL.
 */
public XmlStreamReader(final URL url) throws IOException {
    // Opens a connection to the URL and delegates to the URLConnection constructor, without a default encoding.
    this(Objects.requireNonNull(url, "url").openConnection(), null);
}
646 */ 647 public XmlStreamReader(final URLConnection urlConnection, final String defaultEncoding) throws IOException { 648 Objects.requireNonNull(urlConnection, "urlConnection"); 649 this.defaultEncoding = defaultEncoding; 650 final boolean lenient = true; 651 final String contentType = urlConnection.getContentType(); 652 final InputStream inputStream = urlConnection.getInputStream(); 653 @SuppressWarnings("resource") // managed by the InputStreamReader tracked by this instance 654 // @formatter:off 655 final BOMInputStream bomInput = BOMInputStream.builder() 656 .setInputStream(new BufferedInputStream(inputStream, IOUtils.DEFAULT_BUFFER_SIZE)) 657 .setInclude(false) 658 .setByteOrderMarks(BOMS) 659 .get(); 660 @SuppressWarnings("resource") 661 final BOMInputStream piInput = BOMInputStream.builder() 662 .setInputStream(new BufferedInputStream(bomInput, IOUtils.DEFAULT_BUFFER_SIZE)) 663 .setInclude(true) 664 .setByteOrderMarks(XML_GUESS_BYTES) 665 .get(); 666 // @formatter:on 667 if (urlConnection instanceof HttpURLConnection || contentType != null) { 668 this.encoding = processHttpStream(bomInput, piInput, lenient, contentType); 669 } else { 670 this.encoding = processHttpStream(bomInput, piInput, lenient); 671 } 672 this.reader = new InputStreamReader(piInput, encoding); 673 } 674 675 /** 676 * Calculates the HTTP encoding. 677 * @param bomEnc BOM encoding 678 * @param xmlGuessEnc XML Guess encoding 679 * @param xmlEnc XML encoding 680 * @param lenient indicates if the charset encoding detection should be relaxed. 681 * @param httpContentType The HTTP content type 682 * 683 * @return the HTTP encoding 684 * @throws IOException thrown if there is a problem reading the stream. 
685 */ 686 String calculateHttpEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc, final boolean lenient, final String httpContentType) 687 throws IOException { 688 689 // Lenient and has XML encoding 690 if (lenient && xmlEnc != null) { 691 return xmlEnc; 692 } 693 694 // Determine mime/encoding content types from HTTP Content Type 695 final String cTMime = getContentTypeMime(httpContentType); 696 final String cTEnc = getContentTypeEncoding(httpContentType); 697 final boolean appXml = isAppXml(cTMime); 698 final boolean textXml = isTextXml(cTMime); 699 700 // Mime type NOT "application/xml" or "text/xml" 701 if (!appXml && !textXml) { 702 final String msg = MessageFormat.format(HTTP_EX_3, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 703 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 704 } 705 706 // No content type encoding 707 if (cTEnc == null) { 708 if (appXml) { 709 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 710 } 711 return defaultEncoding == null ? 
US_ASCII : defaultEncoding; 712 } 713 714 // UTF-16BE or UTF-16LE content type encoding 715 if (cTEnc.equals(UTF_16BE) || cTEnc.equals(UTF_16LE)) { 716 if (bomEnc != null) { 717 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 718 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 719 } 720 return cTEnc; 721 } 722 723 // UTF-16 content type encoding 724 if (cTEnc.equals(UTF_16)) { 725 if (bomEnc != null && bomEnc.startsWith(UTF_16)) { 726 return bomEnc; 727 } 728 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 729 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 730 } 731 732 // UTF-32BE or UTF-132E content type encoding 733 if (cTEnc.equals(UTF_32BE) || cTEnc.equals(UTF_32LE)) { 734 if (bomEnc != null) { 735 final String msg = MessageFormat.format(HTTP_EX_1, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 736 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 737 } 738 return cTEnc; 739 } 740 741 // UTF-32 content type encoding 742 if (cTEnc.equals(UTF_32)) { 743 if (bomEnc != null && bomEnc.startsWith(UTF_32)) { 744 return bomEnc; 745 } 746 final String msg = MessageFormat.format(HTTP_EX_2, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 747 throw new XmlStreamReaderException(msg, cTMime, cTEnc, bomEnc, xmlGuessEnc, xmlEnc); 748 } 749 750 return cTEnc; 751 } 752 753 /** 754 * Calculate the raw encoding. 755 * 756 * @param bomEnc BOM encoding 757 * @param xmlGuessEnc XML Guess encoding 758 * @param xmlEnc XML encoding 759 * @return the raw encoding 760 * @throws IOException thrown if there is a problem reading the stream. 
761 */ 762 String calculateRawEncoding(final String bomEnc, final String xmlGuessEnc, final String xmlEnc) throws IOException { 763 764 // BOM is Null 765 if (bomEnc == null) { 766 if (xmlGuessEnc == null || xmlEnc == null) { 767 return defaultEncoding == null ? UTF_8 : defaultEncoding; 768 } 769 if (xmlEnc.equals(UTF_16) && (xmlGuessEnc.equals(UTF_16BE) || xmlGuessEnc.equals(UTF_16LE))) { 770 return xmlGuessEnc; 771 } 772 return xmlEnc; 773 } 774 775 // BOM is UTF-8 776 if (bomEnc.equals(UTF_8)) { 777 if (xmlGuessEnc != null && !xmlGuessEnc.equals(UTF_8)) { 778 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 779 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 780 } 781 if (xmlEnc != null && !xmlEnc.equals(UTF_8)) { 782 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 783 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 784 } 785 return bomEnc; 786 } 787 788 // BOM is UTF-16BE or UTF-16LE 789 if (bomEnc.equals(UTF_16BE) || bomEnc.equals(UTF_16LE)) { 790 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 791 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 792 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 793 } 794 if (xmlEnc != null && !xmlEnc.equals(UTF_16) && !xmlEnc.equals(bomEnc)) { 795 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 796 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 797 } 798 return bomEnc; 799 } 800 801 // BOM is UTF-32BE or UTF-32LE 802 if (bomEnc.equals(UTF_32BE) || bomEnc.equals(UTF_32LE)) { 803 if (xmlGuessEnc != null && !xmlGuessEnc.equals(bomEnc)) { 804 final String msg = MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 805 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 806 } 807 if (xmlEnc != null && !xmlEnc.equals(UTF_32) && !xmlEnc.equals(bomEnc)) { 808 final String msg = 
MessageFormat.format(RAW_EX_1, bomEnc, xmlGuessEnc, xmlEnc); 809 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 810 } 811 return bomEnc; 812 } 813 814 // BOM is something else 815 final String msg = MessageFormat.format(RAW_EX_2, bomEnc, xmlGuessEnc, xmlEnc); 816 throw new XmlStreamReaderException(msg, bomEnc, xmlGuessEnc, xmlEnc); 817 } 818 819 /** 820 * Closes the XmlStreamReader stream. 821 * 822 * @throws IOException thrown if there was a problem closing the stream. 823 */ 824 @Override 825 public void close() throws IOException { 826 reader.close(); 827 } 828 829 /** 830 * Does lenient detection. 831 * 832 * @param httpContentType content-type header to use for the resolution of the charset encoding. 833 * @param ex The thrown exception 834 * @return the encoding 835 * @throws IOException thrown if there is a problem reading the stream. 836 */ 837 private String doLenientDetection(String httpContentType, XmlStreamReaderException ex) throws IOException { 838 if (httpContentType != null && httpContentType.startsWith("text/html")) { 839 httpContentType = httpContentType.substring("text/html".length()); 840 httpContentType = "text/xml" + httpContentType; 841 try { 842 return calculateHttpEncoding(ex.getBomEncoding(), ex.getXmlGuessEncoding(), ex.getXmlEncoding(), true, httpContentType); 843 } catch (final XmlStreamReaderException ex2) { 844 ex = ex2; 845 } 846 } 847 String encoding = ex.getXmlEncoding(); 848 if (encoding == null) { 849 encoding = ex.getContentTypeEncoding(); 850 } 851 if (encoding == null) { 852 encoding = defaultEncoding == null ? UTF_8 : defaultEncoding; 853 } 854 return encoding; 855 } 856 857 /** 858 * Gets the default encoding to use if none is set in HTTP content-type, XML prolog and the rules based on content-type are not adequate. 859 * <p> 860 * If it is {@code null} the content-type based rules are used. 861 * </p> 862 * 863 * @return the default encoding to use. 
864 */ 865 public String getDefaultEncoding() { 866 return defaultEncoding; 867 } 868 869 /** 870 * Gets the charset encoding of the XmlStreamReader. 871 * 872 * @return charset encoding. 873 */ 874 public String getEncoding() { 875 return encoding; 876 } 877 878 /** 879 * Process the raw stream. 880 * 881 * @param bomInput BOMInputStream to detect byte order marks 882 * @param piInput BOMInputStream to guess XML encoding 883 * @param lenient indicates if the charset encoding detection should be relaxed. 884 * @return the encoding to be used 885 * @throws IOException thrown if there is a problem reading the stream. 886 */ 887 private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient) throws IOException { 888 final String bomEnc = bomInput.getBOMCharsetName(); 889 final String xmlGuessEnc = piInput.getBOMCharsetName(); 890 final String xmlEnc = getXmlProlog(piInput, xmlGuessEnc); 891 try { 892 return calculateRawEncoding(bomEnc, xmlGuessEnc, xmlEnc); 893 } catch (final XmlStreamReaderException ex) { 894 if (lenient) { 895 return doLenientDetection(null, ex); 896 } 897 throw ex; 898 } 899 } 900 901 /** 902 * Processes an HTTP stream. 903 * 904 * @param bomInput BOMInputStream to detect byte order marks 905 * @param piInput BOMInputStream to guess XML encoding 906 * @param lenient indicates if the charset encoding detection should be relaxed. 907 * @param httpContentType The HTTP content type 908 * @return the encoding to be used 909 * @throws IOException thrown if there is a problem reading the stream. 
*/
    private String processHttpStream(final BOMInputStream bomInput, final BOMInputStream piInput, final boolean lenient, final String httpContentType)
            throws IOException {
        final String bomEncoding = bomInput.getBOMCharsetName();
        final String guessedEncoding = piInput.getBOMCharsetName();
        final String prologEncoding = getXmlProlog(piInput, guessedEncoding);
        try {
            return calculateHttpEncoding(bomEncoding, guessedEncoding, prologEncoding, lenient, httpContentType);
        } catch (final XmlStreamReaderException ex) {
            if (!lenient) {
                throw ex;
            }
            // Lenient mode: fall back instead of propagating the failure.
            return doLenientDetection(httpContentType, ex);
        }
    }

    /**
     * Reads characters into a portion of an array by delegating to the underlying reader's {@code read(char[], int, int)} method.
     *
     * @param buf    the buffer to read the characters into
     * @param offset The start offset
     * @param len    The maximum number of characters to read
     * @return the number of characters read or -1 if the end of stream
     * @throws IOException if an I/O error occurs.
     */
    @Override
    public int read(final char[] buf, final int offset, final int len) throws IOException {
        return reader.read(buf, offset, len);
    }

}