URLConnection.guessContentTypeFromStream() does not support UTF8 and UTF32 with BOM

Charles Lee littlee at linux.vnet.ibm.com
Tue Mar 1 17:39:37 PST 2011


Hi guys,

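The test case [1] below shows that URLConnection.guessContentTypeFromStream()
does not recognize XML content that starts with a UTF-8 or UTF-32 BOM.
This problem can be solved with the patch [2].
The patch is straightforward:
1. Read more bytes (16 instead of 11), since matching "<?x" after a UTF-32 BOM needs 16 bytes.
2. Add XML type detection for content that starts with a UTF-8 or UTF-32 BOM.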

[1] test case:
/**
 * Prints, for each Unicode encoding listed below, the encoding name followed by
 * the MIME type that URLConnection.guessContentTypeFromStream() reports for the
 * text "<?xml" encoded in that encoding and prefixed with its byte order mark.
 *
 * @param args unused
 * @throws IOException if building the sample bytes or probing the stream fails
 */
public static void main(String[] args) throws IOException {
    final String xmlPrefix = "<?xml";
    final String[] charsetNames = {"UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE", "UTF-32LE"};

    for (int i = 0; i < charsetNames.length; i++) {
        String charsetName = charsetNames[i];
        System.out.println(charsetName + ":");

        // The probe needs a stream that supports mark/reset; ByteArrayInputStream does.
        InputStream sample = new ByteArrayInputStream(toBOMBytes(xmlPrefix, charsetName));
        String guessedType = URLConnection.guessContentTypeFromStream(sample);
        System.out.println(guessedType);

        sample.close();
    }
}

/**
 * Encodes {@code text} in the charset named {@code enc} and prefixes the result
 * with the byte order mark of that encoding.
 *
 * A BOM is written only for "UTF-8", "UTF-16BE", "UTF-16LE", "UTF-32BE" and
 * "UTF-32LE"; for any other charset name the encoded text is returned unprefixed.
 *
 * @param text the text to encode
 * @param enc  the charset name used both to pick the BOM and to encode the text
 * @return the BOM (if any) followed by the encoded text
 * @throws IOException if writing fails or the charset name is not supported
 */
private static byte[] toBOMBytes(String text, String enc) throws IOException {
    // Select the byte order mark first; the encodings are mutually exclusive.
    final byte[] bom;
    if (enc.equals("UTF-8")) {
        bom = new byte[] {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    } else if (enc.equals("UTF-16BE")) {
        bom = new byte[] {(byte) 0xFE, (byte) 0xFF};
    } else if (enc.equals("UTF-16LE")) {
        bom = new byte[] {(byte) 0xFF, (byte) 0xFE};
    } else if (enc.equals("UTF-32BE")) {
        bom = new byte[] {(byte) 0x00, (byte) 0x00, (byte) 0xFE, (byte) 0xFF};
    } else if (enc.equals("UTF-32LE")) {
        bom = new byte[] {(byte) 0xFF, (byte) 0xFE, (byte) 0x00, (byte) 0x00};
    } else {
        bom = new byte[0];
    }

    ByteArrayOutputStream buffer = new ByteArrayOutputStream();
    buffer.write(bom);
    buffer.write(text.getBytes(enc));
    return buffer.toByteArray();
}

[2] patch:
diff --git src/share/classes/java/net/URLConnection.java src/share/classes/java/net/URLConnection.java
--- src/share/classes/java/net/URLConnection.java
+++ src/share/classes/java/net/URLConnection.java
@@ -1422,7 +1422,7 @@
          if (!is.markSupported())
              return null;

-        is.mark(12);
+        is.mark(16);
          int c1 = is.read();
          int c2 = is.read();
          int c3 = is.read();
@@ -1434,6 +1434,11 @@
          int c9 = is.read();
          int c10 = is.read();
          int c11 = is.read();
+	int c12 = is.read();
+	int c13 = is.read();
+	int c14 = is.read();
+	int c15 = is.read();
+	int c16 = is.read();
          is.reset();

          if (c1 == 0xCA&&  c2 == 0xFE&&  c3 == 0xBA&&  c4 == 0xBE) {
@@ -1461,6 +1466,13 @@
              }
          }

+	// UTF-8 encoding, with BOM (UTF-8 has no byte-order variants)
+	if (c1 == 0xef&&  c2 == 0xbb&&  c3 == 0xbf) {
+	    if (c4 == '<'&&  c5 == '?'&&  c6 == 'x') {
+		return "application/xml";
+	    }
+	}
+
          // big and little endian UTF-16 encodings, with byte order mark
          if (c1 == 0xfe&&  c2 == 0xff) {
              if (c3 == 0&&  c4 == '<'&&  c5 == 0&&  c6 == '?'&&
@@ -1476,6 +1488,19 @@
              }
          }

+	// big and little endian UTF-32 encodings, with BOM
+	if (c1 == 0xff&&  c2 == 0xfe&&  c3 == 0x0&&  c4 == 0x0) {
+	    if (c5 == '<'&&  c9 == '?'&&  c13 == 'x') {
+		return "application/xml";
+	    }
+	}
+
+	if (c1 == 0x0&&  c2 == 0x0&&  c3 == 0xfe&&  c4 == 0xff) {
+	    if (c8 == '<'&&  c12 == '?'&&  c16 == 'x') {
+		return "application/xml";
+	    }
+	}
+
          if (c1 == 'G'&&  c2 == 'I'&&  c3 == 'F'&&  c4 == '8') {
              return "image/gif";
          }






More information about the net-dev mailing list