diff --git a/Makefile b/Makefile index 34d32bc09fd25..4b53beac67184 100644 --- a/Makefile +++ b/Makefile @@ -24,9 +24,11 @@ pcre_h.j: test: debug ./julia tests.j -testall: test +test-utf8: ./julia test_utf8.j +testall: test test-utf8 + SLOCCOUNT = sloccount \ --addlang makefile \ --personcost 100000 \ @@ -51,4 +53,4 @@ clean: cleanall: clean $(MAKE) -C src cleanother -.PHONY: default debug release julia-debug julia-release test testall sloccount clean cleanall +.PHONY: default debug release julia-debug julia-release test test-* testall sloccount clean cleanall diff --git a/ascii.j b/ascii.j new file mode 100644 index 0000000000000..3ebd446436c09 --- /dev/null +++ b/ascii.j @@ -0,0 +1,28 @@ +## from src/boot.j +# type ASCIIString <: String; data::Array{Uint8,1}; end + +next(s::ASCIIString, i::Index) = (char(s.data[i]), i+1) + +## overload methods for efficiency ## + +length(s::ASCIIString) = length(s.data) +cmp(a::ASCIIString, b::ASCIIString) = lexcmp(a.data, b.data) +ind2chr(s::ASCIIString, i::Int) = i +chr2ind(s::ASCIIString, i::Int) = i +strchr(s::ASCIIString, c::Char) = c < 0x80 ? memchr(s.data, c) : error("char not found") +nextind(s::ASCIIString, i::Int) = i +prevind(s::ASCIIString, i::Int) = i-1 +strcat(s::ASCIIString, t::ASCIIString, x::ASCIIString...) = ASCIIString(strdatacat(s, t, x...)) + +## outputting ASCII strings ## + +print(s::ASCIIString) = print(s.data) +write(io, s::ASCIIString) = write(io, s.data) + +## transcoding to ASCII ## + +ascii(s::ASCIIString) = s +function ascii(s::String) + f = c -> (c < 0x80) ? 
uint8(c) : error("invalid ASCII code point: U+$(hex(c))") + ASCIIString(map(f, chars(s))) +end diff --git a/expr.j b/expr.j index 661156a46fe13..2821624a4a872 100644 --- a/expr.j +++ b/expr.j @@ -1,6 +1,6 @@ ## symbols ## -symbol(s::Latin1String) = symbol(s.data) +symbol(s::ASCIIString) = symbol(s.data) symbol(s::UTF8String) = symbol(s.data) symbol(a::Array{Uint8,1}) = ccall(:jl_symbol_n, Any, (Ptr{Uint8}, Int32), a, int32(length(a)))::Symbol diff --git a/latin1.j b/latin1.j index d81fe5d6f174f..4256133a60ebd 100644 --- a/latin1.j +++ b/latin1.j @@ -1,7 +1,6 @@ -## from boot.j: -# type Latin1String <: String -# data::Array{Uint8,1} -# end +type Latin1String <: String + data::Array{Uint8,1} +end next(s::Latin1String, i::Index) = (char(s.data[i]), i+1) @@ -27,7 +26,6 @@ write(io, s::Latin1String) = write(io, s.data) latin1(s::Latin1String) = s function latin1(s::String) - f = c -> (c <= 0xff) ? uint8(c) : - error("invalid Latin-1 code point: U+$(hex(c))") + f = c -> (c <= 0xff) ? uint8(c) : error("invalid Latin-1 code point: U+$(hex(c))") Latin1String(map(f, chars(s))) end diff --git a/multi.j b/multi.j index c094df6f8124d..12549b852d398 100644 --- a/multi.j +++ b/multi.j @@ -194,7 +194,7 @@ function identify_socket(otherid, fd, sock) @assert i < PGRP.myid PGRP.workers[i] = Worker(locs[i].host, locs[i].port, fd, sock) PGRP.workers[i].id = i - #write(stdout_stream, latin1("$(PGRP.myid) heard from $i\n")) + #write(stdout_stream, "$(PGRP.myid) heard from $i\n") () end diff --git a/src/alloc.c b/src/alloc.c index c2939a0f82c86..5c7031db85b22 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -32,7 +32,7 @@ jl_type_t *jl_array_uint8_type; jl_type_t *jl_array_any_type; jl_struct_type_t *jl_weakref_type; jl_tag_type_t *jl_string_type; -jl_struct_type_t *jl_latin1_string_type; +jl_struct_type_t *jl_ascii_string_type; jl_struct_type_t *jl_utf8_string_type; jl_struct_type_t *jl_expr_type; jl_bits_type_t *jl_intrinsic_type; diff --git a/src/array.c b/src/array.c index 
d3ce60b8bf47f..22ebeb0a2d745 100644 --- a/src/array.c +++ b/src/array.c @@ -195,8 +195,8 @@ jl_value_t *jl_pchar_to_string(char *str, size_t len) { jl_array_t *a = jl_pchar_to_array(str, len); JL_GC_PUSH(&a); - jl_struct_type_t* string_type = u8_isvalid(a->data, len) < 2 ? - jl_latin1_string_type : jl_utf8_string_type; + jl_struct_type_t* string_type = u8_isvalid(a->data, len) == 1 ? // ASCII + jl_ascii_string_type : jl_utf8_string_type; jl_value_t *s = jl_apply((jl_function_t*)string_type, (jl_value_t**)&a, 1); JL_GC_POP(); return s; diff --git a/src/boot.j b/src/boot.j index f395d9beabfc0..a53fa08e4b018 100644 --- a/src/boot.j +++ b/src/boot.j @@ -117,15 +117,10 @@ isequal(w, v::WeakRef) = isequal(w, v.value) abstract String -type Latin1String <: String - data::Array{Uint8,1} -end - -type UTF8String <: String - data::Array{Uint8,1} -end +type ASCIIString <: String; data::Array{Uint8,1}; end +type UTF8String <: String; data::Array{Uint8,1}; end -typealias ByteString Union(Latin1String,UTF8String) +typealias ByteString Union(ASCIIString,UTF8String) abstract Exception diff --git a/src/dump.c b/src/dump.c index bc86d49d04894..5d15b2b6ce726 100644 --- a/src/dump.c +++ b/src/dump.c @@ -785,7 +785,7 @@ void jl_save_system_image(char *fname, char *startscriptname) jl_serialize_value(&f, jl_float64_type); jl_serialize_value(&f, jl_weakref_type); jl_serialize_value(&f, jl_string_type); - jl_serialize_value(&f, jl_latin1_string_type); + jl_serialize_value(&f, jl_ascii_string_type); jl_serialize_value(&f, jl_utf8_string_type); jl_serialize_value(&f, jl_errorexception_type); jl_serialize_value(&f, jl_typeerror_type); @@ -852,7 +852,7 @@ void jl_restore_system_image(char *fname) jl_weakref_type->env = NULL; jl_weakref_type->linfo = NULL; jl_string_type = (jl_tag_type_t*)jl_deserialize_value(&f); - jl_latin1_string_type = (jl_struct_type_t*)jl_deserialize_value(&f); + jl_ascii_string_type = (jl_struct_type_t*)jl_deserialize_value(&f); jl_utf8_string_type = 
(jl_struct_type_t*)jl_deserialize_value(&f); jl_errorexception_type = (jl_struct_type_t*)jl_deserialize_value(&f); jl_typeerror_type = (jl_struct_type_t*)jl_deserialize_value(&f); diff --git a/src/init.c b/src/init.c index 048e25a877dd8..34466871c28cd 100644 --- a/src/init.c +++ b/src/init.c @@ -251,7 +251,7 @@ void jl_get_builtin_hooks() jl_weakref_type->env = NULL; jl_weakref_type->linfo = NULL; jl_string_type = (jl_tag_type_t*)global("String"); - jl_latin1_string_type = (jl_struct_type_t*)global("Latin1String"); + jl_ascii_string_type = (jl_struct_type_t*)global("ASCIIString"); jl_utf8_string_type = (jl_struct_type_t*)global("UTF8String"); jl_errorexception_type = (jl_struct_type_t*)global("ErrorException"); jl_typeerror_type = (jl_struct_type_t*)global("TypeError"); diff --git a/src/julia.h b/src/julia.h index d2b0f8a030e10..dec4415a990a1 100644 --- a/src/julia.h +++ b/src/julia.h @@ -263,7 +263,7 @@ extern jl_struct_type_t *jl_array_type; extern jl_typename_t *jl_array_typename; extern jl_struct_type_t *jl_weakref_type; extern jl_tag_type_t *jl_string_type; -extern jl_struct_type_t *jl_latin1_string_type; +extern jl_struct_type_t *jl_ascii_string_type; extern jl_struct_type_t *jl_utf8_string_type; extern jl_struct_type_t *jl_errorexception_type; extern jl_struct_type_t *jl_typeerror_type; @@ -401,9 +401,9 @@ void *allocb_permanent(size_t sz); #define jl_is_task(v) jl_typeis(v,jl_task_type) #define jl_is_func(v) (jl_is_func_type(jl_typeof(v)) || jl_is_struct_type(v)) #define jl_is_function(v) jl_is_func(v) -#define jl_is_latin1_string(v) jl_typeis(v,jl_latin1_string_type) +#define jl_is_ascii_string(v) jl_typeis(v,jl_ascii_string_type) #define jl_is_utf8_string(v) jl_typeis(v,jl_utf8_string_type) -#define jl_is_byte_string(v) (jl_is_latin1_string(v) || jl_is_utf8_string(v)) +#define jl_is_byte_string(v) (jl_is_ascii_string(v) || jl_is_utf8_string(v)) #define jl_is_string(v) jl_subtype(v,(jl_value_t*)jl_string_type,1) #define jl_is_cpointer(v) 
jl_is_cpointer_type(jl_typeof(v)) #define jl_is_pointer(v) jl_is_cpointer_type(jl_typeof(v)) diff --git a/start.j b/start.j index c1b76323a9df2..92b414beb146c 100644 --- a/start.j +++ b/start.j @@ -36,7 +36,7 @@ set_current_output_stream(stdout_stream) stdin_stream = fdio(ccall(:jl_stdin, Int32, ())) stderr_stream = fdio(ccall(:jl_stderr, Int32, ())) load("string.j") -load("latin1.j") +load("ascii.j") load("utf8.j") load("show.j") load("regex.j") diff --git a/string.j b/string.j index a17581409087d..3857045f926d5 100644 --- a/string.j +++ b/string.j @@ -340,7 +340,9 @@ function print_escaped(s::String, q::Bool, xmax::Char) if q; print('"'); end end -print_escaped(s::Latin1String, q) = print_escaped(s, q, '\xff') +# TODO: make sure ASCII, Latin-1 and UTF-8 strings all get +# printed so that when input back they are equivalent. + print_escaped(s::String, q) = print_escaped(s, q, '\x7f') print_escaped(s::String) = print_escaped(s, false) print_quoted (s::String) = print_escaped(s, true) @@ -708,7 +710,7 @@ function uint2str(n::Int, b::Int) ccall(:uint2str, Ptr{Uint8}, (Ptr{Uint8}, Ulong, Uint64, Uint32), data, ulong(sz), uint64(n), uint32(b)) - Latin1String(data[1:(sz-1)]) # cut out terminating NUL + ASCIIString(data[1:(sz-1)]) # cut out terminating NUL end uint2str(n::Int, b::Int, len::Int) = lpad(uint2str(n,b),len,'0') diff --git a/sysimg.j b/sysimg.j index 31991633cf4a8..5ab4c95866cf2 100644 --- a/sysimg.j +++ b/sysimg.j @@ -33,7 +33,7 @@ load("io.j") ccall(:jl_set_memio_func, Void, ()) set_current_output_stream(make_stdout_stream()) # for error reporting load("string.j") -load("latin1.j") +load("ascii.j") load("utf8.j") load("show.j") diff --git a/table.j b/table.j index 4c8cb17af56c9..c86cd25b90567 100644 --- a/table.j +++ b/table.j @@ -81,11 +81,7 @@ function hash(a::Array) h end -# TODO: should we distinguish a UTF8String and -# a Latin1String containing the same exact data? 
- -hash(s::Union(UTF8String,Latin1String)) = - ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data)) +hash(s::ByteString) = ccall(:memhash32, Uint32, (Ptr{Void}, Size), s.data, length(s.data)) # hash table diff --git a/test_utf8.j b/test_utf8.j index b4e4e333744f3..a6bb73482d5e6 100644 --- a/test_utf8.j +++ b/test_utf8.j @@ -1,6 +1,6 @@ -utf32 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]); -utf8 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]); -@assert utf32 == utf8 +str1 = CharString(read(open("unicode/UTF-32LE.txt"), Char, 1112065)[2:]); +str2 = UTF8String(read(open("unicode/UTF-8.txt"), Uint8, 4382595)[4:]); +@assert str1 == str2 str1 = "∀ ε > 0, ∃ δ > 0: |x-y| < δ ⇒ |f(x)-f(y)| < ε" str2 = CharString( diff --git a/utf8.j b/utf8.j index 8c86e72fdf195..9f7044993bf49 100644 --- a/utf8.j +++ b/utf8.j @@ -1,7 +1,5 @@ -## from boot.j: -# type UTF8String <: String -# data::Array{Uint8,1} -# end +## from src/boot.j: +# type UTF8String <: String; data::Array{Uint8,1}; end ## basic UTF-8 decoding & iteration ##