Wrong encoding after XML identity transformation
Nico R.
n-roeser at gmx.net
Sat Mar 29 19:47:40 UTC 2014
Hello again!
A few hours ago, I wrote:
[…]
>
> I’m attaching my test code, which I hope is correct and readable.
It seems that the list software stripped off the attachment with the
test program, so here it is again, inline: :-/
// This document is encoded in UTF-8, with no BOM and with LF line endings.
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.PrintStream;
import java.util.ArrayList;
import java.util.List;
import javax.xml.parsers.DocumentBuilder;
import javax.xml.parsers.DocumentBuilderFactory;
import javax.xml.transform.OutputKeys;
import javax.xml.transform.Transformer;
import javax.xml.transform.TransformerFactory;
import javax.xml.transform.TransformerFactoryConfigurationError;
import javax.xml.transform.dom.DOMSource;
import javax.xml.transform.stream.StreamResult;
import org.w3c.dom.Document;
import org.w3c.dom.Element;
import org.w3c.dom.Text;
public class TransformerTest {
/** convenience method, prints a byte array in human-readable ASCII
chars */
private static void printArray(byte[] data, PrintStream out) {
if (data == null) {
out.println("(null)");
return;
}
if (data.length == 0) {
out.println("(empty array)");
return;
}
for (int i = 0; ; ) {
byte b = data[i];
if (b >= 0x20 && b <= 0x7F) {
out.print(" '" + (char)b + "'");
} else {
out.format("0x%02X", (int)b & 0xFF);
}
if (++i == data.length) break;
if ((i % 16) == 0) {
out.println(',');
} else {
out.print(", ");
}
}
out.println();
}
/**
* parses a {@code Document} from an {@code InputStream} (and not from a
* {@code StringReader}, in order to make sure it’s not a problem when
* parsing internal Unicode {@code String}s)
*/
private static Document makeDocument() throws Exception {
DocumentBuilder builder =
DocumentBuilderFactory.newInstance().newDocumentBuilder();
byte[] docBytes = {
'<', '?', 'x', 'm', 'l', ' ',
'v', 'e', 'r', 's', 'i', 'o', 'n', '=',
'"', '1', '.', '0', '"', ' ',
// document is encoded in US-ASCII, but labelled as a superset
'e', 'n', 'c', 'o', 'd', 'i', 'n', 'g', '=',
'"', 'I', 'S', 'O', '-', '8', '8', '5', '9', '-', '1', '"',
'?', '>',
'<', 'x', '/', '>'
};
return builder.parse(new ByteArrayInputStream(docBytes));
}
/** prepares a list of transformer factories to test */
private static List<TransformerFactory> makeTransformerFactories() {
String[] facNames = {
"org.apache.xalan.processor.TransformerFactoryImpl",
"org.apache.xalan.xsltc.trax.TransformerFactoryImpl",
"com.sun.org.apache.xalan.internal.xsltc.trax.TransformerFactoryImpl"
};
List<TransformerFactory> transFactories = new ArrayList<>();
for (String facName : facNames) {
try {
TransformerFactory factory =
TransformerFactory.newInstance(facName, null);
transFactories.add(factory);
} catch (TransformerFactoryConfigurationError ex) {
System.out.println("cannot build instance of " + facName);
}
}
//transFactories.add(TransformerFactory.newInstance()); //
default impl
return transFactories;
}
private static byte[] transformWith(TransformerFactory transFac,
Document doc) throws Exception {
Transformer trans = transFac.newTransformer();
trans.setOutputProperty(OutputKeys.METHOD, "xml");
trans.setOutputProperty(OutputKeys.ENCODING, "UTF-8");
System.out.println("transformer factory: " +
transFac.getClass().getName());
System.out.println("transformer: " + trans.getClass().getName());
ByteArrayOutputStream bout = new ByteArrayOutputStream(128);
trans.transform(new DOMSource(doc), new StreamResult(bout));
byte[] data = bout.toByteArray();
return data;
}
public static void main(String[] t) throws Exception {
Document doc = makeDocument();
System.out.println(
"encoding declared in document : " +
doc.getXmlEncoding());
// now insert a text node containing a character which can be
// represented in ISO-8859-1, as well as (differently) in UTF-8,
but not
// in US-ASCII
Element dstRoot = doc.getDocumentElement();
Text text = doc.createTextNode("schön");
dstRoot.insertBefore(text, null);
System.out.println("root element text: " +
dstRoot.getTextContent());
List<TransformerFactory> factories = makeTransformerFactories();
for (TransformerFactory factory : factories) {
System.out.println();
byte[] bytes = transformWith(factory, doc);
System.out.println("resulting bytes:");
printArray(bytes, System.out);
}
}
}
--
Nico
More information about the core-libs-dev
mailing list