Skip to content

Commit 14878d8

Browse files
committed
Checkpoint on work to disable CRLF->LF
This rewrites the remapping logic and consolidates it. DAFFODIL-1559
1 parent 9ec3a83 commit 14878d8

20 files changed

+512
-229
lines changed

daffodil-core/src/main/scala/org/apache/daffodil/dsom/Facets.scala

+43-2
Original file line numberDiff line numberDiff line change
@@ -21,10 +21,41 @@ import java.math.BigInteger
2121
import scala.xml.Node
2222
import org.apache.daffodil.exceptions.Assert
2323
import org.apache.daffodil.dpath.NodeInfo.PrimType
24-
import org.apache.daffodil.xml.XMLUtils
24+
import org.apache.daffodil.xml.RemapPUAToXMLIllegalChar
25+
26+
object Facets {

  /**
   * Shared remapper applied to pattern facet values before they are
   * compiled into regular expressions.
   *
   * Why it is needed:
   *
   * An external XSD validator (ex: Xerces) sees the infoset only after
   * XML-illegal characters have been mapped up into the Unicode Private
   * Use Area (PUA). A pattern that wants to match, say, control-A (U+0001)
   * therefore cannot mention U+0001 itself; it has to mention the PUA
   * stand-in U+E001, typically written as `&#xE001;`. That form works
   * unchanged for external validation.
   *
   * Daffodil's own internal (aka limited) validation, however, runs against
   * the regular DFDL infoset, i.e., before any remapping for XML happens.
   *
   * So for internal use the pattern facet value itself is mapped back down:
   * a `&#xE000;` in the pattern becomes a real NUL (\u0000 or \x00) in the
   * regex used for limited validation.
   */
  private val remapper: RemapPUAToXMLIllegalChar = new RemapPUAToXMLIllegalChar()
}
2555

2656
trait Facets { self: Restriction =>
2757
import org.apache.daffodil.dsom.FacetTypes._
58+
import Facets._
2859

2960
private def retrieveFacetValueFromRestrictionBase(xml: Node, facetName: Facet.Type): String = {
3061
val res = xml \\ "restriction" \ facetName.toString() \ "@value"
@@ -151,7 +182,17 @@ trait Facets { self: Restriction =>
151182
// The XSD numeric character entity `&#xE000;` can be used to match ASCII NUL
152183
// (char code 0).
153184
//
154-
val remapped: String = XMLUtils.remapPUAToXMLIllegalCharacters(v)
185+
// This remapping is for pattern facets, which are inside a DFDL schema,
186+
// and so will not contain CR characters, since XML reading will convert those
187+
// to LF. To discuss CR in this pattern we can't use `&#xD;` syntax because that
188+
// turns into a CR which gets turned into a LF. Plus the pattern value is
189+
// an XML attribute, the value of which gets its whitespace collapsed, all
190+
// line-ending chars converted to spaces, and adjacent spaces collapsed to one.
191+
//
192+
// So a pattern facet must use `\r` and `\n` to describe line-endings within the pattern.
193+
// And in general one must be careful about whitespace.
194+
//
195+
val remapped: String = remapper.remap(v)
155196
(f, remapped.r)
156197
}
157198
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,135 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.daffodil.util
18+
19+
/**
 * An abstract base for Remappers which convert strings.
 *
 * The public interface is just `def remap(s: String): String`.
 *
 * Implementations must provide the protected three-argument `remap`
 * which decides the replacement for one character given its neighbors.
 *
 * Contains shared implementation methods also.
 *
 * NOTE: This is inner loop stuff. Keep it and derived classes lean and fast.
 * Use a java-like coding style. While loops, not map/flatmap/etc. Avoid tuples.
 */
trait CharacterSetRemapper {

  /**
   * Remaps the string. Returns the original string object if the
   * context-free scan (see `remapChar`) finds nothing to remap.
   *
   * @param s the string to remap
   * @return the remapped string, or `s` itself when no character needs remapping
   */
  def remap(s: String): String = remapImpl(s)

  /**
   * Remaps 1 character, does not consider any context: the previous and
   * next characters are both passed as 0.
   */
  def remapChar(c: Char): Char = remap(0, c, 0).toChar

  /**
   * Remaps characters. Provides the previous and following characters since some remappings
   * require this context.
   *
   * Plays a trick with negating the return value in order to avoid having to
   * return more than one value, which is potentially less efficient.
   *
   * @param prev The character prior to the one being considered. (Needed for surrogates)
   *             0 when there is no prior character.
   * @param curr The character under consideration for remapping.
   * @param next The next character afterwards. (Needed for surrogates and CRLF pairs)
   *             0 when there is no next character.
   * @return The remapped character (as an Int) or that same remapped character Int
   *         value negated, which signals that curr+next was remapped to a single character.
   *         Such as is needed if CRLF is remapped to just LF.
   */
  protected def remap(prev: Int, curr: Int, next: Int): Int

  /**
   * True if any single character of `s`, taken without context, is changed
   * by `remapChar`.
   */
  private def needsRemapping(s: String): Boolean = {
    // a one liner in scala,
    //
    // `s.exists{ c => remapChar(c) != c }`
    //
    // but we need a fast java-like while loop...
    val len = s.length
    var pos = 0
    while (pos < len) {
      val c = s(pos)
      if (remapChar(c) != c)
        return true
      pos += 1
    }
    false
  }

  /**
   * Scan first to avoid allocating a new string when remapping is
   * not needed, which is the common case.
   * This is worth it to save the allocation overhead.
   */
  private def remapImpl(s: String): String = if (needsRemapping(s)) remapLoop(s) else s

  /**
   * Because of surrogate pairs, and the difference between 16-bit string codepoints
   * and real character codes, lots of things that traverse strings need
   * to consider either the codepoint after (if current is a leading surrogate)
   * or codepoint before (if current is a trailing surrogate).
   *
   * This algorithm uses a StringBuilder which is not synchronized
   * so it is noticeably faster than StringBuffer, and since the StringBuilder
   * is local to the function, we don't have to worry about any threading issues.
   *
   * When `remap` returns a negated value, the (un-negated) replacement is
   * appended and exactly one following character is skipped; remapping then
   * resumes normally with the character after that.
   */
  private def remapLoop(s: String): String = {

    val len = s.length
    if (len == 0) return s

    // The output is never longer than the input, so len is a sufficient capacity.
    val sb = new StringBuilder(len)

    var pos = 0
    var prev = 0
    var curr = s(0).toInt
    var next = 0
    // Set when the previous remap consumed curr+next as a unit (e.g. CRLF => LF),
    // so that the character now in curr must not be emitted.
    var skipCurr = false

    while (pos < len) {
      next = if (pos + 1 < len) s(pos + 1).toInt else 0
      if (skipCurr) {
        // Skip exactly one character, then go back to remapping.
        // (Leaving the skip state set would drop the whole rest of the string.)
        skipCurr = false
      } else {
        val mapped = remap(prev, curr, next)
        if (mapped < 0) {
          //
          // a negative result is still the replacement character code,
          // just negated to indicate the need to skip the next character
          //
          sb.append((-mapped).toChar)
          skipCurr = true
        } else {
          sb.append(mapped.toChar)
        }
      }
      // prev always tracks the actual preceding character of the input
      prev = curr
      curr = next
      pos += 1
    }

    sb.toString
  }
}
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
/*
2+
* Licensed to the Apache Software Foundation (ASF) under one or more
3+
* contributor license agreements. See the NOTICE file distributed with
4+
* this work for additional information regarding copyright ownership.
5+
* The ASF licenses this file to You under the Apache License, Version 2.0
6+
* (the "License"); you may not use this file except in compliance with
7+
* the License. You may obtain a copy of the License at
8+
*
9+
* http://www.apache.org/licenses/LICENSE-2.0
10+
*
11+
* Unless required by applicable law or agreed to in writing, software
12+
* distributed under the License is distributed on an "AS IS" BASIS,
13+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14+
* See the License for the specific language governing permissions and
15+
* limitations under the License.
16+
*/
17+
package org.apache.daffodil.xml
18+
19+
import org.apache.daffodil.exceptions.Assert
20+
import org.apache.daffodil.util.CharacterSetRemapper
21+
22+
/**
 * Remaps illegal XML chars to the Unicode Private Use Area (PUA), and
 * optionally CR also to the PUA.
 *
 * Unpaired unicode surrogate code points are remapped too; well-formed
 * surrogate pairs are left as they are.
 *
 * Legal XML v1.0 chars are #x9 | #xA | #xD | [#x20-#xD7FF] | [#xE000-#xFFFD] | [#x10000-#x10FFFF]
 *
 * Normally XML also remaps CRLF to LF and CR (isolated) to LF, but this is problematic
 * when data must be preserved perfectly, so that behavior can be turned off.
 *
 * We also can check and error if the string contains conflicting PUA characters to begin with.
 *
 * @param checkForExistingPUA when true, a character already in the PUA range
 *                            (U+E000 to U+F8FF) is a usage error; when false it passes through.
 * @param replaceCRWithLF when true, CRLF and isolated CR both become LF (standard XML
 *                        behavior); when false, CR is remapped to U+E00D so it is preserved.
 */
final class RemapXMLIllegalCharToPUA (
  checkForExistingPUA: Boolean,
  replaceCRWithLF: Boolean)
  extends CharacterSetRemapper {

  /**
   * Remaps to PUA. Note the return is the negated char code of the replacement
   * char if the following character is to be skipped (CRLF collapsed to LF).
   */
  override protected def remap(prev: Int, curr: Int, next: Int): Int = {
    if (curr == 0x9 || curr == 0xA) {
      // tab and LF are legal XML and pass through
      curr
    } else if (curr == 0xD) {
      remapCR(next)
    } else if (curr < 0x20) {
      // ascii c0 controls
      curr + 0xE000
    } else if (curr > 0xD7FF && curr < 0xE000) {
      // surrogate chars
      val leadOfPair = isLeadingSurrogate(curr) && isTrailingSurrogate(next)
      val trailOfPair = isTrailingSurrogate(curr) && isLeadingSurrogate(prev)
      if (leadOfPair || trailOfPair) {
        // well formed surrogate pairs are preserved
        curr
      } else {
        // curr is an isolated surrogate, so to preserve we must remap to PUA
        curr + 0x1000
      }
    } else if (curr >= 0xE000 && curr <= 0xF8FF) {
      // Unicode PUA is E000 to F8FF.
      if (checkForExistingPUA)
        Assert.usageError("Pre-existing Private Use Area (PUA) character found in data: '%s'".format(curr))
      else
        curr
    } else if (curr < 0xFFFE) {
      curr
    } else if (curr == 0xFFFE) {
      0xF0FE
    } else if (curr == 0xFFFF) {
      0xF0FF
    } else {
      Assert.usageError("Character code beyond U+FFFF. Codepoint: %s".format(curr))
    }
  }

  /**
   * Replacement for a CR, given the character that follows it.
   */
  private def remapCR(next: Int): Int = {
    if (!replaceCRWithLF) {
      // Remap CR (isolated or part of CRLF) to the PUA to preserve it.
      // A following LF is left alone.
      0xE00D
    } else if (next == 0xA) {
      // CRLF => LF, standard XML behavior. Note negated.
      -0xA
    } else {
      // isolated CR => LF, standard XML behavior. Note NOT negated.
      0xA
    }
  }

  private def isLeadingSurrogate(c: Int) = c >= 0xD800 && c <= 0xDBFF

  private def isTrailingSurrogate(c: Int) = c >= 0xDC00 && c <= 0xDFFF
}
93+
94+
/**
 * Reverse of the RemapXMLIllegalCharToPUA mapping: characters in the
 * PUA ranges used by that mapping are moved back down to the XML-illegal
 * code points they stand for. Everything else is unchanged.
 */
final class RemapPUAToXMLIllegalChar()
  extends CharacterSetRemapper {

  /**
   * This direction of remapping is simpler. No context characters are needed, and
   * it never returns a negated character code.
   */
  override protected def remap(prevIgnored: Int, c: Int, nextIgnored: Int): Int =
    remapChar(c.toChar).toInt

  /**
   * Maps one PUA character back to its original code point.
   */
  override def remapChar(c: Char): Char = {
    val code = c.toInt
    val mapped =
      if (code >= 0xE000 && code <= 0xE01F) code - 0xE000 // Ascii c0 controls
      else if (code >= 0xE800 && code <= 0xEFFF) code - 0x1000 // surrogate codepoints
      else if (code == 0xF0FE) 0xFFFE
      else if (code == 0xF0FF) 0xFFFF
      else code
    mapped.toChar
  }
}

0 commit comments

Comments
 (0)