From c2deef37a31b19e37b4ce7bd9b877148d39d4f90 Mon Sep 17 00:00:00 2001
From: Basil Crow
Date: Mon, 24 Apr 2023 13:42:59 -0700
Subject: [PATCH] Allow `PrettyPrintWriter` to replace invalid XML characters
when not running in quirks mode. Closes #335.
---
xstream-distribution/src/content/changes.html | 1 +
.../xstream/io/xml/PrettyPrintWriter.java | 62 ++++++++---
.../xstream/io/xml/PrettyPrintWriterTest.java | 103 +++++++++++++++++-
3 files changed, 148 insertions(+), 18 deletions(-)
diff --git a/xstream-distribution/src/content/changes.html b/xstream-distribution/src/content/changes.html
index a8c4d62ec..7df8a69ff 100644
--- a/xstream-distribution/src/content/changes.html
+++ b/xstream-distribution/src/content/changes.html
@@ -111,6 +111,7 @@ Upcoming 1.4.x maintenance release
Minor changes
+ - GHPR:#335: Allow PrettyPrintWriter to replace invalid XML characters when not running in quirks mode (by Basil Crow).
- GHPR:#331, GHI:#326: Fix handling of empty java.util.concurrent.atomic.AtomicReference (by Alex Blekhman of Atlassian).
- GHPR:#334: Fix remaining buffer size calculation in QuickWriter (by Higuchi Yuta).
- GHI:#342: Optimize internal handling of children in DomReader avoiding O(n²) access times for siblings (by Shiang-Yun Yang).
diff --git a/xstream/src/java/com/thoughtworks/xstream/io/xml/PrettyPrintWriter.java b/xstream/src/java/com/thoughtworks/xstream/io/xml/PrettyPrintWriter.java
index 5d6f752fd..a504785a5 100644
--- a/xstream/src/java/com/thoughtworks/xstream/io/xml/PrettyPrintWriter.java
+++ b/xstream/src/java/com/thoughtworks/xstream/io/xml/PrettyPrintWriter.java
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2004, 2005, 2006 Joe Walnes.
- * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015 XStream Committers.
+ * Copyright (C) 2006, 2007, 2008, 2009, 2011, 2013, 2014, 2015, 2023 XStream Committers.
* All rights reserved.
*
* The software in this package is published under the terms of the BSD
@@ -42,6 +42,8 @@
* href="http://www.w3.org/TR/2006/REC-xml11-20060816/#charsets">1.1</a>. If a character is not supported, a
* {@link StreamException} is thrown. Select a proper parser implementation that respects the version in the XML header
* (the Xpp3 parser will also read character entities of normally invalid characters).
+ * You may also switch to XML_1_0_REPLACEMENT or XML_1_1_REPLACEMENT mode, which will replace the invalid characters
+ * with a U+FFFD replacement character.
*
*
* @author Joe Walnes
@@ -52,6 +54,8 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
public static int XML_QUIRKS = -1;
public static int XML_1_0 = 0;
public static int XML_1_1 = 1;
+ public static int XML_1_0_REPLACEMENT = 2;
+ public static int XML_1_1_REPLACEMENT = 3;
private final QuickWriter writer;
private final FastStack<String> elementStack = new FastStack<>(16);
@@ -71,6 +75,7 @@ public class PrettyPrintWriter extends AbstractXmlWriter {
private static final char[] QUOT = "&quot;".toCharArray();
private static final char[] APOS = "&apos;".toCharArray();
private static final char[] CLOSE = "</".toCharArray();
+ private static final char[] REPLACEMENT = "&#xfffd;".toCharArray();
/**
* @since 1.4
@@ -80,8 +85,8 @@ public PrettyPrintWriter(final Writer writer, final int mode, final char[] lineI
this.writer = new QuickWriter(writer);
this.lineIndenter = lineIndenter;
this.mode = mode;
- if (mode < XML_QUIRKS || mode > XML_1_1) {
- throw new IllegalArgumentException("Not a valid XML mode");
+ if (mode < XML_QUIRKS || mode > XML_1_1_REPLACEMENT) {
+ throw new IllegalArgumentException("Not a valid XML mode: " + mode);
}
}
@@ -213,6 +218,8 @@ private void writeText(final String text, final boolean isAttribute) {
case '\0':
if (mode == XML_QUIRKS) {
writer.write(NULL);
+ } else if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
+ writer.write(REPLACEMENT);
} else {
throw new StreamException("Invalid character 0x0 in XML stream");
}
@@ -244,32 +251,53 @@ private void writeText(final String text, final boolean isAttribute) {
//$FALL-THROUGH$
default:
if (Character.isDefined(c) && !Character.isISOControl(c)) {
+ boolean replaced = false;
if (mode != XML_QUIRKS) {
if (c > '\ud7ff' && c < '\ue000') {
- throw new StreamException("Invalid character 0x"
- + Integer.toHexString(c)
- + " in XML stream");
+ if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
+ writer.write(REPLACEMENT);
+ replaced = true;
+ } else {
+ throw new StreamException("Invalid character 0x"
+ + Integer.toHexString(c)
+ + " in XML stream");
+ }
}
}
- writer.write(c);
+ if (!replaced) {
+ writer.write(c);
+ }
} else {
- if (mode == XML_1_0) {
+ boolean replaced = false;
+ if (mode == XML_1_0 || mode == XML_1_0_REPLACEMENT) {
if (c < 9 || c == '\u000b' || c == '\u000c' || c == '\u000e' || c >= '\u000f' && c <= '\u001f') {
- throw new StreamException("Invalid character 0x"
- + Integer.toHexString(c)
- + " in XML 1.0 stream");
+ if (mode == XML_1_0_REPLACEMENT) {
+ writer.write(REPLACEMENT);
+ replaced = true;
+ } else {
+ throw new StreamException("Invalid character 0x"
+ + Integer.toHexString(c)
+ + " in XML 1.0 stream");
+ }
}
}
if (mode != XML_QUIRKS) {
if (c == '\ufffe' || c == '\uffff') {
- throw new StreamException("Invalid character 0x"
- + Integer.toHexString(c)
- + " in XML stream");
+ if (mode == XML_1_0_REPLACEMENT || mode == XML_1_1_REPLACEMENT) {
+ writer.write(REPLACEMENT);
+ replaced = true;
+ } else {
+ throw new StreamException("Invalid character 0x"
+ + Integer.toHexString(c)
+ + " in XML stream");
+ }
}
}
- writer.write("&#x");
- writer.write(Integer.toHexString(c));
- writer.write(';');
+ if (!replaced) {
+ writer.write("&#x");
+ writer.write(Integer.toHexString(c));
+ writer.write(';');
+ }
}
}
}
diff --git a/xstream/src/test/com/thoughtworks/xstream/io/xml/PrettyPrintWriterTest.java b/xstream/src/test/com/thoughtworks/xstream/io/xml/PrettyPrintWriterTest.java
index fe64c1c1a..3ba459020 100644
--- a/xstream/src/test/com/thoughtworks/xstream/io/xml/PrettyPrintWriterTest.java
+++ b/xstream/src/test/com/thoughtworks/xstream/io/xml/PrettyPrintWriterTest.java
@@ -1,6 +1,6 @@
/*
* Copyright (C) 2004, 2005 Joe Walnes.
- * Copyright (C) 2006, 2007, 2008, 2013, 2018 XStream Committers.
+ * Copyright (C) 2006, 2007, 2008, 2013, 2018, 2023 XStream Committers.
* All rights reserved.
*
* The software in this package is published under the terms of the BSD
@@ -168,6 +168,24 @@ public void testThrowsForNullInXml1_1Mode() {
}
}
+ public void testReplacesNullInXml1_0ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
+ writer.startNode("tag");
+ writer.setValue("\u0000");
+ writer.endNode();
+
+ assertXmlProducedIs("<tag>&#xfffd;</tag>");
+ }
+
+ public void testReplacesNullInXml1_1ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
+ writer.startNode("tag");
+ writer.setValue("\u0000");
+ writer.endNode();
+
+ assertXmlProducedIs("<tag>&#xfffd;</tag>");
+ }
+
public void testSupportsOnlyValidControlCharactersInXml1_0Mode() {
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0);
writer.startNode("tag");
@@ -237,6 +255,65 @@ public void testSupportsOnlyValidControlCharactersInXml1_1Mode() {
+ "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
}
+ public void testReplacesInvalidControlCharactersInXml1_0ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
+ writer.startNode("tag");
+ final String ctrl = ""
+ + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
+ + "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
+ + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
+ + "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
+ + "\u007f"
+ + "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
+ + "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
+ + "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
+ + "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
+ + "";
+ for (int i = 0; i < ctrl.length(); i++) {
+ final char c = ctrl.charAt(i);
+ writer.setValue(new Character(c).toString());
+ }
+ writer.endNode();
+
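+ assertXmlProducedIs("<tag>&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
+ + "&#xfffd;\t\n&#xfffd;&#xfffd;&#xd;&#xfffd;&#xfffd;"
+ + "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
+ + "&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;&#xfffd;"
+ + "&#x7f;"
+ + "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
+ + "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
+ + "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
+ + "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");
+ }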
+
+ public void testReplacesInvalidControlCharactersInXml1_1ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
+ writer.startNode("tag");
+ final String ctrl = ""
+ + "\u0000\u0001\u0002\u0003\u0004\u0005\u0006\u0007"
+ + "\u0008\u0009\n\u000b\u000c\r\u000e\u000f"
+ + "\u0010\u0011\u0012\u0013\u0014\u0015\u0016\u0017"
+ + "\u0018\u0019\u001a\u001b\u001c\u001d\u001e\u001f"
+ + "\u007f"
+ + "\u0080\u0081\u0082\u0083\u0084\u0085\u0086\u0087"
+ + "\u0088\u0089\u008a\u008b\u008c\u008d\u008e\u008f"
+ + "\u0090\u0091\u0092\u0093\u0094\u0095\u0096\u0097"
+ + "\u0098\u0099\u009a\u009b\u009c\u009d\u009e\u009f"
+ + "";
+ for (int i = 0; i < ctrl.length(); i++) {
+ final char c = ctrl.charAt(i);
+ writer.setValue(new Character(c).toString());
+ }
+ writer.endNode();
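+ assertXmlProducedIs("<tag>&#xfffd;&#x1;&#x2;&#x3;&#x4;&#x5;&#x6;&#x7;"
+ + "&#x8;\t\n&#xb;&#xc;&#xd;&#xe;&#xf;"
+ + "&#x10;&#x11;&#x12;&#x13;&#x14;&#x15;&#x16;&#x17;"
+ + "&#x18;&#x19;&#x1a;&#x1b;&#x1c;&#x1d;&#x1e;&#x1f;&#x7f;"
+ + "&#x80;&#x81;&#x82;&#x83;&#x84;&#x85;&#x86;&#x87;"
+ + "&#x88;&#x89;&#x8a;&#x8b;&#x8c;&#x8d;&#x8e;&#x8f;"
+ + "&#x90;&#x91;&#x92;&#x93;&#x94;&#x95;&#x96;&#x97;"
+ + "&#x98;&#x99;&#x9a;&#x9b;&#x9c;&#x9d;&#x9e;&#x9f;</tag>");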
+ }
+
public void testSupportsInvalidUnicodeCharacterslInQuirksMode() {
writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_QUIRKS);
writer.startNode("tag");
@@ -295,6 +372,30 @@ public void testThrowsForInvalidUnicodeCharacterslInXml1_1Mode() {
assertXmlProducedIs("<tag>&#xd7ff;\ue000\ufffd</tag>");
}
+ public void testReplacesInvalidUnicodeCharactersInXml1_0ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_0_REPLACEMENT);
+ writer.startNode("tag");
+ final String ctrl = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
+ for (int i = 0; i < ctrl.length(); i++) {
+ final char c = ctrl.charAt(i);
+ writer.setValue(new Character(c).toString());
+ }
+ writer.endNode();
+ assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
+ }
+
+ public void testReplacesInvalidUnicodeCharactersInXml1_1ReplacementMode() {
+ writer = new PrettyPrintWriter(buffer, PrettyPrintWriter.XML_1_1_REPLACEMENT);
+ writer.startNode("tag");
+ final String ctrl = "\ud7ff\ud800\udfff\ue000\ufffd\ufffe\uffff";
+ for (int i = 0; i < ctrl.length(); i++) {
+ final char c = ctrl.charAt(i);
+ writer.setValue(new Character(c).toString());
+ }
+ writer.endNode();
+ assertXmlProducedIs("<tag>&#xd7ff;&#xfffd;&#xfffd;\ue000\ufffd&#xfffd;&#xfffd;</tag>");
+ }
+
private String replace(final String in, final char what, final String with) {
final int pos = in.indexOf(what);
if (pos == -1) {