Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Upgrade to PCRE2 #11447

Merged
merged 2 commits into from
Jun 1, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,7 @@ notifications:
before_install:
- if [ `uname` = "Linux" ]; then
BUILDOPTS="USEGCC=1 LLVM_CONFIG=llvm-config-3.3 LLVM_LLC=llc-3.3 VERBOSE=1 USE_BLAS64=0";
for lib in LLVM ZLIB SUITESPARSE ARPACK BLAS FFTW LAPACK GMP MPFR PCRE LIBUNWIND OPENLIBM RMATH; do
for lib in LLVM ZLIB SUITESPARSE ARPACK BLAS FFTW LAPACK GMP MPFR LIBUNWIND OPENLIBM RMATH; do
export BUILDOPTS="$BUILDOPTS USE_SYSTEM_$lib=1";
done;
sudo add-apt-repository ppa:staticfloat/julia-deps -y;
Expand All @@ -42,7 +42,7 @@ before_install:
brew install -v --only-dependencies --HEAD julia;
BUILDOPTS="USECLANG=1 LLVM_CONFIG=$(brew --prefix llvm33-julia)/bin/llvm-config-3.3 VERBOSE=1 USE_BLAS64=0 SUITESPARSE_INC=-I$(brew --prefix suite-sparse-julia)/include";
BUILDOPTS="$BUILDOPTS LIBBLAS=-lopenblas LIBBLASNAME=libopenblas LIBLAPACK=-lopenblas LIBLAPACKNAME=libopenblas";
for lib in LLVM ZLIB SUITESPARSE ARPACK BLAS FFTW LAPACK GMP MPFR PCRE LIBUNWIND LIBGIT2; do
for lib in LLVM ZLIB SUITESPARSE ARPACK BLAS FFTW LAPACK GMP MPFR LIBUNWIND LIBGIT2; do
export BUILDOPTS="$BUILDOPTS USE_SYSTEM_$lib=1";
done;
export LDFLAGS="-L$(brew --prefix openblas-julia)/lib -L$(brew --prefix suite-sparse-julia)/lib";
Expand Down
4 changes: 2 additions & 2 deletions Make.inc
Original file line number Diff line number Diff line change
Expand Up @@ -557,9 +557,9 @@ LLVM_LLC=$(build_bindir)/llc$(EXE)
endif

ifeq ($(USE_SYSTEM_PCRE), 1)
PCRE_CONFIG = pcre-config
PCRE_CONFIG = pcre2-config
else
PCRE_CONFIG = $(build_bindir)/pcre-config
PCRE_CONFIG = $(build_bindir)/pcre2-config
endif

# Use 64-bit libraries by default on 64-bit architectures
Expand Down
11 changes: 6 additions & 5 deletions base/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,6 @@ JULIAHOME = ..
include ../deps/Versions.make
include ../Make.inc

PCRE_CONST = 0x[0-9a-fA-F]+|[-+]?\s*[0-9]+

TAGGED_RELEASE_BANNER = ""

ifneq ($(USEMSVC), 1)
Expand All @@ -14,13 +12,16 @@ endif

all: pcre_h.jl errno_h.jl build_h.jl.phony fenv_constants.jl file_constants.jl uv_constants.jl version_git.jl.phony

pcre_h.jl:
PCRE_CONST = 0x[0-9a-fA-F]+|[0-9]+
ifeq ($(USE_SYSTEM_PCRE), 1)
@$(call PRINT_PERL, $(CPP) -dM $(shell $(PCRE_CONFIG) --prefix)/include/pcre.h | perl -nle '/^\s*#define\s+PCRE_(\w*)\s*\(?($(PCRE_CONST))\)?\s*$$/ and print "const $$1 = Int32($$2)"' | sort > $@)
PCRE_INCL_PATH = $(shell $(PCRE_CONFIG) --prefix)/include/pcre2.h
else
@$(call PRINT_PERL, $(CPP) -dM $(build_includedir)/pcre.h | perl -nle '/^\s*#define\s+PCRE_(\w*)\s*\(?($(PCRE_CONST))\)?\s*$$/ and print "const $$1 = Int32($$2)"' | sort > $@)
PCRE_INCL_PATH = $(build_includedir)/pcre2.h
endif

pcre_h.jl:
@$(call PRINT_PERL, $(CPP) -D PCRE2_CODE_UNIT_WIDTH=8 -dM $(PCRE_INCL_PATH) | perl -nle '/^\s*#define\s+PCRE2_(\w*)\s*\(?($(PCRE_CONST))\)?u?\s*$$/ and print "const $$1 = UInt32($$2)"' | sort > $@)

errno_h.jl:
@$(call PRINT_PERL, echo '#include "errno.h"' | $(CPP) -dM - | perl -nle 'print "const $$1 = Int32($$2)" if /^#define\s+(E\w+)\s+(\d+)\s*$$/' | sort > $@)

Expand Down
147 changes: 69 additions & 78 deletions base/pcre.jl
Original file line number Diff line number Diff line change
@@ -1,19 +1,26 @@
# This file is a part of Julia. License is MIT: http://julialang.org/license

## low-level pcre interface ##
## low-level pcre2 interface ##

module PCRE

include("pcre_h.jl")

const VERSION = bytestring(ccall((:pcre_version, :libpcre), Ptr{UInt8}, ()))
const PCRE_LIB = "libpcre2-8"

global JIT_STACK = C_NULL
global MATCH_CONTEXT = C_NULL

# Module initializer: creates the shared JIT stack and the shared match
# context, then attaches the stack to the context so that matches run
# through MATCH_CONTEXT use it.
#
# Only the PCRE2 entry points are called here; the PCRE1
# `pcre_jit_stack_alloc` call against `:libpcre` is gone, since its result was
# immediately overwritten (leaking the allocation) and it required the old
# library to still be loadable.
function __init__()
    JIT_STACK_START_SIZE = 32768
    JIT_STACK_MAX_SIZE = 1048576
    # pcre2_jit_stack_create takes its two sizes as PCRE2_SIZE (size_t), not
    # int, so they are declared as Csize_t.
    global JIT_STACK = ccall((:pcre2_jit_stack_create_8, PCRE_LIB), Ptr{Void},
                             (Csize_t, Csize_t, Ptr{Void}),
                             JIT_STACK_START_SIZE, JIT_STACK_MAX_SIZE, C_NULL)
    global MATCH_CONTEXT = ccall((:pcre2_match_context_create_8, PCRE_LIB),
                                 Ptr{Void}, (Ptr{Void},), C_NULL)
    # A NULL callback with non-NULL data makes the data pointer itself the
    # JIT stack to use (per the pcre2jit documentation).
    ccall((:pcre2_jit_stack_assign_8, PCRE_LIB), Void,
          (Ptr{Void}, Ptr{Void}, Ptr{Void}), MATCH_CONTEXT, C_NULL, JIT_STACK)
end

# supported options for different use cases
Expand All @@ -25,7 +32,6 @@ const COMPILE_MASK =
DOTALL |
EXTENDED |
FIRSTLINE |
JAVASCRIPT_COMPAT |
MULTILINE |
NEWLINE_ANY |
NEWLINE_ANYCRLF |
Expand All @@ -34,9 +40,9 @@ const COMPILE_MASK =
NEWLINE_LF |
NO_AUTO_CAPTURE |
NO_START_OPTIMIZE |
NO_UTF8_CHECK |
NO_UTF_CHECK |
UNGREEDY |
UTF8
UTF

const EXECUTE_MASK =
NEWLINE_ANY |
Expand All @@ -49,20 +55,20 @@ const EXECUTE_MASK =
NOTEMPTY_ATSTART |
NOTEOL |
NO_START_OPTIMIZE |
NO_UTF8_CHECK |
NO_UTF_CHECK |
PARTIAL_HARD |
PARTIAL_SOFT


const OPTIONS_MASK = COMPILE_MASK | EXECUTE_MASK

function info{T}(
regex::Ptr{Void},
extra::Ptr{Void}, what::Integer, ::Type{T}
)
const UNSET = ~Csize_t(0) # Indicates that an output vector element is unset

function info(regex::Ptr{Void}, what::Integer, T)
buf = zeros(UInt8,sizeof(T))
ret = ccall((:pcre_fullinfo, :libpcre), Int32,
(Ptr{Void}, Ptr{Void}, Int32, Ptr{UInt8}),
regex, extra, what, buf)
ret = ccall((:pcre2_pattern_info_8, PCRE_LIB), Int32,
(Ptr{Void}, Int32, Ptr{UInt8}),
regex, what, buf)
if ret != 0
error(ret == ERROR_NULL ? "NULL regex object" :
ret == ERROR_BADMAGIC ? "invalid regex object" :
Expand All @@ -72,83 +78,68 @@ function info{T}(
reinterpret(T,buf)[1]
end

function config{T}(what::Integer, ::Type{T})
buf = zeros(UInt8, sizeof(T))
ret = ccall((:pcre_config, :libpcre), Int32,
(Int32, Ptr{UInt8}),
what, buf)

if ret != 0
error("PCRE.config error code $n")
end
reinterpret(T,buf)[1]
# Wrap the output vector of `match_data` as a Julia array of Csize_t.
#
# The vector holds one (start, end) offset pair per slot reported by
# pcre2_get_ovector_count, hence the length of twice that count. The array is
# created with `own = false`, so Julia does not take ownership of the memory.
function get_ovec(match_data)
    pair_count = ccall((:pcre2_get_ovector_count_8, PCRE_LIB), UInt32,
                       (Ptr{Void},), match_data)
    ovec_ptr = ccall((:pcre2_get_ovector_pointer_8, PCRE_LIB), Ptr{Csize_t},
                     (Ptr{Void},), match_data)
    return pointer_to_array(ovec_ptr, 2 * pair_count, false)
end

# Compile `pattern` with the given PCRE2 `options` and return the pointer to
# the compiled code. Throws an error carrying the PCRE2 error message and the
# pattern offset when compilation fails.
function compile(pattern::AbstractString, options::Integer)
    # Work on the byte representation so the length passed to PCRE2 is the
    # byte length of exactly the buffer being handed over.
    bytes = bytestring(pattern)
    errno = Ref{Cint}(0)
    # PCRE2 writes the error offset as a PCRE2_SIZE (size_t); a 4-byte slot
    # would be overrun on 64-bit platforms.
    erroff = Ref{Csize_t}(0)
    # The pattern is passed as Ptr{UInt8} together with its length rather
    # than as a Cstring: the buffer is length-delimited, not NUL-terminated,
    # so embedded NUL bytes must not be rejected.
    re_ptr = ccall((:pcre2_compile_8, PCRE_LIB), Ptr{Void},
                   (Ptr{UInt8}, Csize_t, UInt32, Ref{Cint}, Ref{Csize_t}, Ptr{Void}),
                   bytes, sizeof(bytes), options, errno, erroff, C_NULL)
    re_ptr == C_NULL && error("PCRE compilation error: $(err_message(errno[])) at offset $(erroff[])")
    re_ptr
end

function study(regex::Ptr{Void}, options::Integer)
# NOTE: options should always be zero in current PCRE
errstr = Array(Ptr{UInt8},1)
errstr[1] = C_NULL
extra = ccall((:pcre_study, :libpcre), Ptr{Void},
(Ptr{Void}, Int32, Ptr{Ptr{UInt8}}),
regex, options, errstr)
if errstr[1] != C_NULL
error("$(bytestring(errstr[1]))")
end

ccall((:pcre_assign_jit_stack, :libpcre), Void,
(Ptr{Void}, Ptr{Void}, Ptr{Void}),
extra, C_NULL, JIT_STACK)
extra
# JIT-compile the compiled pattern `regex` with the JIT_COMPLETE option.
# Returns `true` on success; throws an error with the PCRE2 message otherwise.
function jit_compile(regex::Ptr{Void})
    # pcre2_jit_compile returns a C int (0 on success, a negative error code
    # on failure) and takes its options as uint32_t. Declaring the return as
    # unsigned would turn negative codes into huge positive values that can no
    # longer be converted back when looking up the message.
    errno = ccall((:pcre2_jit_compile_8, PCRE_LIB), Cint,
                  (Ptr{Void}, UInt32),
                  regex, JIT_COMPLETE)
    errno == 0 || error("PCRE JIT error: $(err_message(errno))")
end

study(re::Ptr{Void}) = study(re, Int32(0))
# Release a match-data block by passing it to pcre2_match_data_free_8.
function free_match_data(match_data)
    ccall((:pcre2_match_data_free_8, PCRE_LIB), Void, (Ptr{Void},), match_data)
end

# Release a compiled pattern by passing it to pcre2_code_free_8.
function free_re(re)
    ccall((:pcre2_code_free_8, PCRE_LIB), Void, (Ptr{Void},), re)
end

# Release a JIT stack by passing it to pcre2_jit_stack_free_8.
function free_jit_stack(stack)
    ccall((:pcre2_jit_stack_free_8, PCRE_LIB), Void, (Ptr{Void},), stack)
end

free_study(extra::Ptr{Void}) =
ccall((:pcre_free_study, :libpcre), Void, (Ptr{Void},), extra)
free(regex::Ptr{Void}) =
ccall(unsafe_load(cglobal((:pcre_free, :libpcre),Ptr{Void})), Void, (Ptr{Void},), regex)
# Release a match context by passing it to pcre2_match_context_free_8.
function free_match_context(context)
    ccall((:pcre2_match_context_free_8, PCRE_LIB), Void, (Ptr{Void},), context)
end

function exec(regex::Ptr{Void}, extra::Ptr{Void}, str::SubString, offset::Integer,
options::Integer, ovec::Vector{Int32})
return exec(regex, extra, str.string, str.offset, offset, sizeof(str),
options, ovec)
# Return the PCRE2 error text for the error code `errno` as a string.
#
# The text is written by pcre2_get_error_message into a fixed 256-byte
# buffer; longer messages are truncated by the library.
function err_message(errno)
    # Zero-filled so the buffer is always NUL-terminated, even if the library
    # writes nothing (e.g. for an error code it does not recognise); an
    # uninitialised buffer would then be read past its end.
    buffer = zeros(UInt8, 256)
    # The buffer length parameter is a PCRE2_SIZE (size_t), and the function
    # returns a C int (message length or a negative error), which is ignored.
    ccall((:pcre2_get_error_message_8, PCRE_LIB), Cint,
          (Cint, Ptr{UInt8}, Csize_t), errno, buffer, sizeof(buffer))
    bytestring(pointer(buffer))
end

function exec(regex::Ptr{Void}, extra::Ptr{Void}, str::ByteString, offset::Integer,
options::Integer, ovec::Vector{Int32})
return exec(regex, extra, str, 0, offset, sizeof(str), options, ovec)
function exec(re,subject,offset,options,match_data)
rc = ccall((:pcre2_match_8, PCRE_LIB), Cint,
(Ptr{Void}, Cstring, Csize_t, Csize_t, Cuint, Ptr{Void}, Ptr{Void}),
re, subject, sizeof(subject), offset, options, match_data, MATCH_CONTEXT)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this is subtle but I think that using Cstring here may be inappropriate since we're a) passing the length of the data as another argument, rather than relying on NUL-termination of the data and b) apparently sometimes passing data that contains NUL bytes? cc: @stevengj

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I agree with @StefanKarpinski: it should not use Cstring if the interface takes an explicit length.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems to be revealing a more troubling problem, which is that junk strings seem to be getting passed in here – such as "\t:(Int^M:N)\0|>dump\n" – note the embedded NUL byte. The real issue, of course, is what the heck is this data and why is it being passed into the exec function?

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ok, this is not as nefarious as I thought it was – subject is just the string we're looking for matches in. But Cstring is inappropriate here and possibly in a few other places in this change.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You guys are totally right. And when you're right you're right.

On Monday, June 1, 2015, Stefan Karpinski [email protected] wrote:

In base/pcre.jl
#11447 (comment):

end

-function exec(regex::Ptr{Void}, extra::Ptr{Void}, str::ByteString, offset::Integer,

  •          options::Integer, ovec::Vector{Int32})
    
  • return exec(regex, extra, str, 0, offset, sizeof(str), options, ovec)
    +function exec(re,subject,offset,options,match_data)
  • rc = ccall((:pcre2_match_8, PCRE_LIB), Cint,
  •           (Ptr{Void}, Cstring, Csize_t, Csize_t, Cuint, Ptr{Void}, Ptr{Void}),
    
  •           re, subject, sizeof(subject), offset, options, match_data, MATCH_CONTEXT)
    

Ok, this is not as nefarious as I thought it was – subject is just the
string we're looking for matches in. But Cstring is inappropriate here
and possibly in a few other places in this change.


Reply to this email directly or view it on GitHub
https://github.com/JuliaLang/julia/pull/11447/files#r31456497.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I'm working on a patch that fixes this and a variety of other ccall signature problems, some of which are probably my fault since they're inherited from the original PCRE code. We didn't have all the necessary Cfoo type aliases back then (or any of them, actually), so I just used the types that happened to be correct on my platform.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Thanks for fixing the ccall problems and merging this, Stefan.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No problem. Thanks again for pushing it forward.

# rc == -1 means no match, -2 means partial match.
rc < -2 && error("PCRE.exec error: $(err_message(rc))")
rc >= 0
end

function exec(regex::Ptr{Void}, extra::Ptr{Void},
str::ByteString, shift::Integer, offset::Integer,
len::Integer, options::Integer,
ovec::Vector{Int32})
if offset < 0 || len < offset || len+shift > sizeof(str)
throw(BoundsError())
end
n = ccall((:pcre_exec, :libpcre), Int32,
(Ptr{Void}, Ptr{Void}, Ptr{UInt8}, Int32,
Int32, Int32, Ptr{Int32}, Int32),
regex, extra, pointer(str.data,shift+1), len,
offset, options, ovec, length(ovec))
n < -1 && error("PCRE.exec error code $n")
return n > -1
# Allocate a match-data block for the compiled pattern `re` via
# pcre2_match_data_create_from_pattern_8, passing a NULL general context.
# Returns the raw pointer to the new block.
function create_match_data(re)
    match_data = ccall((:pcre2_match_data_create_from_pattern_8, PCRE_LIB),
                       Ptr{Void}, (Ptr{Void}, Ptr{Void}), re, C_NULL)
    return match_data
end

# Look up the number of the named capture group `name` in the compiled
# pattern `re`. Returns the C int produced by
# pcre2_substring_number_from_name_8 unchanged.
function substring_number_from_name(re, name)
    group_number = ccall((:pcre2_substring_number_from_name_8, PCRE_LIB), Cint,
                         (Ptr{Void}, Cstring), re, name)
    return group_number
end


end # module
Loading