diff --git a/lib/src/char_encodings.dart b/lib/src/char_encodings.dart index 6120056ba6424..95755d04276c7 100644 --- a/lib/src/char_encodings.dart +++ b/lib/src/char_encodings.dart @@ -1,4 +1,4 @@ -import 'utf.dart'; +import 'dart:convert' show ascii, utf8; // TODO(jmesserly): this function is conspicuously absent from dart:utf. /// Returns true if the [bytes] starts with a UTF-8 byte order mark. @@ -20,29 +20,12 @@ bool hasUtf8Bom(List bytes, [int offset = 0, int length]) { Iterable decodeBytes(String encoding, List bytes) { switch (encoding) { case 'ascii': - // TODO(jmesserly): this was taken from runtime/bin/string_stream.dart - for (int byte in bytes) { - if (byte > 127) { - // TODO(jmesserly): ideally this would be DecoderException, like the - // one thrown in runtime/bin/string_stream.dart, but we don't want to - // depend on dart:io. - throw FormatException("Illegal ASCII character $byte"); - } - } - return bytes; + return ascii.decode(bytes).runes; case 'utf-8': - // NOTE: to match the behavior of the other decode functions, we eat the - // utf-8 BOM here. - - var offset = 0; - var length = bytes.length; - - if (hasUtf8Bom(bytes)) { - offset += 3; - length -= 3; - } - return decodeUtf8AsIterable(bytes, offset, length); + // NOTE: To match the other decode functions, we eat the UTF-8 BOM (the + // `utf8.decode` default); malformed input becomes U+FFFD, as before. + return utf8.decode(bytes, allowMalformed: true).runes; default: throw ArgumentError('Encoding $encoding not supported'); @@ -53,20 +36,5 @@ Iterable decodeBytes(String encoding, List bytes) { /// Returns the code points for the [input]. This works like [String.charCodes] /// but it decodes UTF-16 surrogate pairs. 
List toCodepoints(String input) { - var newCodes = []; - for (int i = 0; i < input.length; i++) { - var c = input.codeUnitAt(i); - if (0xD800 <= c && c <= 0xDBFF) { - int next = i + 1; - if (next < input.length) { - var d = input.codeUnitAt(next); - if (0xDC00 <= d && d <= 0xDFFF) { - c = 0x10000 + ((c - 0xD800) << 10) + (d - 0xDC00); - i = next; - } - } - } - newCodes.add(c); - } - return newCodes; + return input.runes.toList(); } diff --git a/lib/src/utf.dart b/lib/src/utf.dart deleted file mode 100644 index a635db8b805ec..0000000000000 --- a/lib/src/utf.dart +++ /dev/null @@ -1,237 +0,0 @@ -// Large portions of this code where taken from https://github.com/dart-lang/utf - -import "dart:collection"; - -const int _replacementCodepoint = 0xfffd; - -const int _UNICODE_VALID_RANGE_MAX = 0x10ffff; -const int _UNICODE_UTF16_RESERVED_LO = 0xd800; -const int _UNICODE_UTF16_RESERVED_HI = 0xdfff; - -const int _UTF8_ONE_BYTE_MAX = 0x7f; -const int _UTF8_TWO_BYTE_MAX = 0x7ff; -const int _UTF8_THREE_BYTE_MAX = 0xffff; - -const int _UTF8_LO_SIX_BIT_MASK = 0x3f; - -const int _UTF8_FIRST_BYTE_OF_TWO_BASE = 0xc0; -const int _UTF8_FIRST_BYTE_OF_THREE_BASE = 0xe0; -const int _UTF8_FIRST_BYTE_OF_FOUR_BASE = 0xf0; -const int _UTF8_FIRST_BYTE_OF_FIVE_BASE = 0xf8; -const int _UTF8_FIRST_BYTE_OF_SIX_BASE = 0xfc; - -const int _UTF8_FIRST_BYTE_BOUND_EXCL = 0xfe; - -/// Decodes the UTF-8 bytes as an iterable. Thus, the consumer can only convert -/// as much of the input as needed. Set the replacementCharacter to null to -/// throw an ArgumentError rather than replace the bad value. -Iterable decodeUtf8AsIterable(List bytes, int offset, int length) => - _IterableUtf8Decoder(bytes, offset, length); - -/// Return type of [decodeUtf8AsIterable] and variants. The Iterable type -/// provides an iterator on demand and the iterator will only translate bytes -/// as requested by the user of the iterator. (Note: results are not cached.) 
-// TODO(floitsch): Consider removing the extend and switch to implements since -// that's cheaper to allocate. -class _IterableUtf8Decoder extends IterableBase { - final List bytes; - final int offset; - final int length; - - _IterableUtf8Decoder(this.bytes, this.offset, this.length); - - _Utf8Decoder get iterator => _Utf8Decoder(bytes, offset, length); -} - -/// Provides an iterator of Unicode codepoints from UTF-8 encoded bytes. The -/// parameters can set an offset into a list of bytes (as int), limit the length -/// of the values to be decoded, and override the default Unicode replacement -/// character. Set the replacementCharacter to null to throw an -/// ArgumentError rather than replace the bad value. The return value -/// from this method can be used as an Iterable (e.g. in a for-loop). -class _Utf8Decoder implements Iterator { - final _ListRangeIterator utf8EncodedBytesIterator; - int _current; - - _Utf8Decoder(List utf8EncodedBytes, int offset, int length) - : utf8EncodedBytesIterator = - (_ListRange(utf8EncodedBytes, offset, length)).iterator; - - _Utf8Decoder._fromListRangeIterator(_ListRange source) - : utf8EncodedBytesIterator = source.iterator; - - /// Decode the remaininder of the characters in this decoder - /// into a [List]. 
- List decodeRest() { - List codepoints = List(utf8EncodedBytesIterator.remaining); - int i = 0; - while (moveNext()) { - codepoints[i++] = current; - } - if (i == codepoints.length) { - return codepoints; - } else { - List truncCodepoints = List(i); - truncCodepoints.setRange(0, i, codepoints); - return truncCodepoints; - } - } - - int get current => _current; - - bool moveNext() { - _current = null; - - if (!utf8EncodedBytesIterator.moveNext()) return false; - - int value = utf8EncodedBytesIterator.current; - int additionalBytes = 0; - - if (value < 0) { - if (_replacementCodepoint != null) { - _current = _replacementCodepoint; - return true; - } else { - throw ArgumentError( - "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); - } - } else if (value <= _UTF8_ONE_BYTE_MAX) { - _current = value; - return true; - } else if (value < _UTF8_FIRST_BYTE_OF_TWO_BASE) { - if (_replacementCodepoint != null) { - _current = _replacementCodepoint; - return true; - } else { - throw ArgumentError( - "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); - } - } else if (value < _UTF8_FIRST_BYTE_OF_THREE_BASE) { - value -= _UTF8_FIRST_BYTE_OF_TWO_BASE; - additionalBytes = 1; - } else if (value < _UTF8_FIRST_BYTE_OF_FOUR_BASE) { - value -= _UTF8_FIRST_BYTE_OF_THREE_BASE; - additionalBytes = 2; - } else if (value < _UTF8_FIRST_BYTE_OF_FIVE_BASE) { - value -= _UTF8_FIRST_BYTE_OF_FOUR_BASE; - additionalBytes = 3; - } else if (value < _UTF8_FIRST_BYTE_OF_SIX_BASE) { - value -= _UTF8_FIRST_BYTE_OF_FIVE_BASE; - additionalBytes = 4; - } else if (value < _UTF8_FIRST_BYTE_BOUND_EXCL) { - value -= _UTF8_FIRST_BYTE_OF_SIX_BASE; - additionalBytes = 5; - } else if (_replacementCodepoint != null) { - _current = _replacementCodepoint; - return true; - } else { - throw ArgumentError( - "Invalid UTF8 at ${utf8EncodedBytesIterator.position}"); - } - int j = 0; - while (j < additionalBytes && utf8EncodedBytesIterator.moveNext()) { - int nextValue = utf8EncodedBytesIterator.current; - if 
(nextValue > _UTF8_ONE_BYTE_MAX && - nextValue < _UTF8_FIRST_BYTE_OF_TWO_BASE) { - value = ((value << 6) | (nextValue & _UTF8_LO_SIX_BIT_MASK)); - } else { - // if sequence-starting code unit, reposition cursor to start here - if (nextValue >= _UTF8_FIRST_BYTE_OF_TWO_BASE) { - utf8EncodedBytesIterator.backup(); - } - break; - } - j++; - } - bool validSequence = (j == additionalBytes && - (value < _UNICODE_UTF16_RESERVED_LO || - value > _UNICODE_UTF16_RESERVED_HI)); - bool nonOverlong = (additionalBytes == 1 && value > _UTF8_ONE_BYTE_MAX) || - (additionalBytes == 2 && value > _UTF8_TWO_BYTE_MAX) || - (additionalBytes == 3 && value > _UTF8_THREE_BYTE_MAX); - bool inRange = value <= _UNICODE_VALID_RANGE_MAX; - if (validSequence && nonOverlong && inRange) { - _current = value; - return true; - } else if (_replacementCodepoint != null) { - _current = _replacementCodepoint; - return true; - } else { - throw ArgumentError( - "Invalid UTF8 at ${utf8EncodedBytesIterator.position - j}"); - } - } -} - -/// _ListRange in an internal type used to create a lightweight Interable on a -/// range within a source list. DO NOT MODIFY the underlying list while -/// iterating over it. The results of doing so are undefined. -// TODO(floitsch): Consider removing the extend and switch to implements since -// that's cheaper to allocate. -class _ListRange extends IterableBase { - final List _source; - final int _offset; - final int _length; - - _ListRange(List source, [int offset = 0, int length]) - : _source = source, - _offset = offset, - _length = (length == null ? 
source.length - offset : length) { - if (_offset < 0 || _offset > _source.length) { - throw RangeError.value(_offset); - } - if (_length != null && (_length < 0)) { - throw RangeError.value(_length); - } - if (_length + _offset > _source.length) { - throw RangeError.value(_length + _offset); - } - } - - _ListRangeIterator get iterator => - _ListRangeIteratorImpl(_source, _offset, _offset + _length); - - int get length => _length; -} - -/// The ListRangeIterator provides more capabilities than a standard iterator, -/// including the ability to get the current position, count remaining items, -/// and move forward/backward within the iterator. -abstract class _ListRangeIterator implements Iterator { - bool moveNext(); - - int get current; - - int get position; - - void backup([int by]); - - int get remaining; - - void skip([int count]); -} - -class _ListRangeIteratorImpl implements _ListRangeIterator { - final List _source; - int _offset; - final int _end; - - _ListRangeIteratorImpl(this._source, int offset, this._end) - : _offset = offset - 1; - - int get current => _source[_offset]; - - bool moveNext() => ++_offset < _end; - - int get position => _offset; - - void backup([int by = 1]) { - _offset -= by; - } - - int get remaining => _end - _offset - 1; - - void skip([int count = 1]) { - _offset += count; - } -} diff --git a/pubspec.yaml b/pubspec.yaml index 81dcae9816dcb..90c3b3363f4bc 100644 --- a/pubspec.yaml +++ b/pubspec.yaml @@ -1,5 +1,5 @@ name: html -version: 0.14.0 +version: 0.14.1-dev description: APIs for parsing and manipulating HTML content outside the browser. 
author: Dart Team @@ -16,4 +16,3 @@ dev_dependencies: path: ^1.6.2 pedantic: ^1.3.0 test: ^1.3.0 - utf: '>=0.9.0 <0.10.0' diff --git a/test/tokenizer_test.dart b/test/tokenizer_test.dart index 744a49666ff9d..59dd2aad83f7e 100644 --- a/test/tokenizer_test.dart +++ b/test/tokenizer_test.dart @@ -7,10 +7,8 @@ import 'dart:io'; import 'dart:mirrors'; import 'package:path/path.dart' as pathos; import 'package:test/test.dart'; -import 'package:html/src/char_encodings.dart'; import 'package:html/src/token.dart'; import 'package:html/src/tokenizer.dart'; -import 'package:utf/utf.dart'; import 'support.dart'; class TokenizerTestParser { @@ -24,7 +22,7 @@ class TokenizerTestParser { List parse(String str) { // Note: we need to pass bytes to the tokenizer if we want it to handle BOM. - var bytes = codepointsToUtf8(toCodepoints(str)); + var bytes = utf8.encode(str); var tokenizer = HtmlTokenizer(bytes, encoding: 'utf-8'); outputTokens = [];