diff --git a/NEWS.md b/NEWS.md index c6a41f44e00c9..6e344b97f7bc5 100644 --- a/NEWS.md +++ b/NEWS.md @@ -120,6 +120,10 @@ Standard library changes * Some degree trigonometric functions, `sind`, `cosd`, `tand`, `asind`, `acosd`, `asecd`, `acscd`, `acotd`, `atand` now accept an square matrix ([#39758]). * A backslash before a newline in command literals now always removes the newline, similar to standard string literals, whereas the result was not well-defined before. ([#40753]) +* `replace(::String)` now allows multiple patterns to be specified, and they + will be applied left-to-right simultaneously, so only one pattern will be + applied to any character, and the patterns will only be applied to the input + text, not the replacements. ([#TBD]) #### Package Manager diff --git a/base/regex.jl b/base/regex.jl index 82e2042552ee4..15744fe14ce47 100644 --- a/base/regex.jl +++ b/base/regex.jl @@ -589,7 +589,7 @@ _free_pat_replacer(r::RegexAndMatchData) = PCRE.free_match_data(r.match_data) replace_err(repl) = error("Bad replacement string: $repl") -function _write_capture(io, re::RegexAndMatchData, group) +function _write_capture(io::IO, group::Int, str, r, re::RegexAndMatchData) len = PCRE.substring_length_bynumber(re.match_data, group) # in the case of an optional group that doesn't match, len == 0 len == 0 && return @@ -598,6 +598,11 @@ function _write_capture(io, re::RegexAndMatchData, group) pointer(io.data, io.ptr), len+1) io.ptr += len io.size = max(io.size, io.ptr - 1) + nothing +end +function _write_capture(io::IO, group::Int, str, r, re) + group == 0 || replace_err("pattern is not a Regex") + return print(io, SubString(str, r)) end @@ -605,7 +610,7 @@ const SUB_CHAR = '\\' const GROUP_CHAR = 'g' const KEEP_ESC = [SUB_CHAR, GROUP_CHAR, '0':'9'...] 
-function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) +function _replace(io, repl_s::SubstitutionString, str, r, re) LBRACKET = '<' RBRACKET = '>' repl = unescape_string(repl_s.string, KEEP_ESC) @@ -629,7 +634,7 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) break end end - _write_capture(io, re, group) + _write_capture(io, group, str, r, re) elseif repl[next_i] == GROUP_CHAR i = nextind(repl, next_i) if i > e || repl[i] != LBRACKET @@ -642,15 +647,16 @@ function _replace(io, repl_s::SubstitutionString, str, r, re::RegexAndMatchData) i = nextind(repl, i) i > e && replace_err(repl) end - # TODO: avoid this allocation groupname = SubString(repl, groupstart, prevind(repl, i)) if all(isdigit, groupname) - _write_capture(io, re, parse(Int, groupname)) - else + group = parse(Int, groupname) + elseif re isa RegexAndMatchData group = PCRE.substring_number_from_name(re.re.regex, groupname) group < 0 && replace_err("Group $groupname not found in regex $(re.re)") - _write_capture(io, re, group) + else + group = -1 end + _write_capture(io, group, str, r, re) i = nextind(repl, i) else replace_err(repl) diff --git a/base/set.jl b/base/set.jl index 0c8a8b95b10ce..5a744c556432c 100644 --- a/base/set.jl +++ b/base/set.jl @@ -621,7 +621,6 @@ replace!(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace!, replace!(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace!, (a, b, c))) replace(a::Callable, b::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b))) replace(a::Callable, b::Pair, c::Pair; count::Integer=-1) = throw(MethodError(replace, (a, b, c))) -replace(a::AbstractString, b::Pair, c::Pair) = throw(MethodError(replace, (a, b, c))) ### replace! 
for AbstractDict/AbstractSet diff --git a/base/strings/util.jl b/base/strings/util.jl index 9bc043513deed..25ac746455b28 100644 --- a/base/strings/util.jl +++ b/base/strings/util.jl @@ -522,56 +522,74 @@ _replace(io, repl::Function, str, r, pattern) = _replace(io, repl::Function, str, r, pattern::Function) = print(io, repl(str[first(r)])) -replace(str::String, pat_repl::Pair{<:AbstractChar}; count::Integer=typemax(Int)) = - replace(str, isequal(first(pat_repl)) => last(pat_repl); count=count) - -replace(str::String, pat_repl::Pair{<:Union{Tuple{Vararg{AbstractChar}}, - AbstractVector{<:AbstractChar},Set{<:AbstractChar}}}; - count::Integer=typemax(Int)) = - replace(str, in(first(pat_repl)) => last(pat_repl), count=count) - _pat_replacer(x) = x _free_pat_replacer(x) = nothing -function replace(str::String, pat_repl::Pair; count::Integer=typemax(Int)) - pattern, repl = pat_repl +_pat_replacer(x::AbstractChar) = isequal(x) +_pat_replacer(x::Union{Tuple{Vararg{AbstractChar}},AbstractVector{<:AbstractChar},Set{<:AbstractChar}}) = in(x) + +function replace(str::String, pat_repl::Vararg{Pair,N}; count::Integer=typemax(Int)) where N count == 0 && return str count < 0 && throw(DomainError(count, "`count` must be non-negative.")) n = 1 - e = lastindex(str) + e1 = nextind(str, lastindex(str)) # sizeof(str)+1 i = a = firstindex(str) - pattern = _pat_replacer(pattern) - r = something(findnext(pattern,str,i), 0) - j, k = first(r), last(r) - if j == 0 - _free_pat_replacer(pattern) + patterns = map(p -> _pat_replacer(first(p)), pat_repl) + replaces = map(last, pat_repl) + rs = map(patterns) do p + r = findnext(p, str, a) + if r === nothing || first(r) == 0 + return e1+1:0 + end + r isa Int && (r = r:r) # findnext / performance fix + return r + end + if all(>(e1), map(first, rs)) + foreach(_free_pat_replacer, patterns) return str end out = IOBuffer(sizehint=floor(Int, 1.2sizeof(str))) - while j != 0 + while true + p = argmin(map(first, rs)) # TODO: or argmin(rs), to pick the shortest 
first match ? + r = rs[p] + j, k = first(r), last(r) + j > e1 && break if i == a || i <= k + # copy out preserved portion GC.@preserve str unsafe_write(out, pointer(str, i), UInt(j-i)) - _replace(out, repl, str, r, pattern) + # copy out replacement string + _replace(out, replaces[p], str, r, patterns[p]) end if k < j i = j - j > e && break + j == e1 && break k = nextind(str, j) else i = k = nextind(str, k) end - r = something(findnext(pattern,str,k), 0) - r === 0:-1 || n == count && break - j, k = first(r), last(r) + n == count && break + let k = k + rs = map(patterns, rs) do p, r + if first(r) < k + r = findnext(p, str, k) + if r === nothing || first(r) == 0 + return e1+1:0 + end + r isa Int && (r = r:r) # findnext / performance fix + end + return r + end + end n += 1 end - _free_pat_replacer(pattern) - write(out, SubString(str,i)) - String(take!(out)) + foreach(_free_pat_replacer, patterns) + write(out, SubString(str, i)) + return String(take!(out)) end + """ - replace(s::AbstractString, pat=>r; [count::Integer]) + replace(s::AbstractString, pat=>r, [pat2=>r2, ...]; [count::Integer]) Search for the given pattern `pat` in `s`, and replace each occurrence with `r`. If `count` is provided, replace at most `count` occurrences. @@ -584,6 +602,13 @@ If `pat` is a regular expression and `r` is a [`SubstitutionString`](@ref), then references in `r` are replaced with the corresponding matched text. To remove instances of `pat` from `string`, set `r` to the empty `String` (`""`). +Multiple patterns can be specified, and they will be applied left-to-right +simultaneously, so only one pattern will be applied to any character, and the +patterns will only be applied to the input text, not the replacements. + +!!! compat "Julia 1.7" + Support for multiple patterns requires version 1.7. 
+ # Examples ```jldoctest julia> replace("Python is a programming language.", "Python" => "Julia") @@ -597,10 +622,13 @@ julia> replace("The quick foxes run quickly.", "quick" => "", count=1) julia> replace("The quick foxes run quickly.", r"fox(es)?" => s"bus\\1") "The quick buses run quickly." + +julia> replace("abcabc", "a" => "b", "b" => "c", r".+" => "a") +"bca" ``` """ -replace(s::AbstractString, pat_f::Pair; count=typemax(Int)) = - replace(String(s), pat_f, count=count) +replace(s::AbstractString, pat_f::Pair...; count=typemax(Int)) = + replace(String(s), pat_f..., count=count) # TODO: allow transform as the first argument to replace? diff --git a/test/strings/util.jl b/test/strings/util.jl index e8ea3b643fcda..d4560b3d33cf6 100644 --- a/test/strings/util.jl +++ b/test/strings/util.jl @@ -307,6 +307,178 @@ end end +@testset "replace many" begin + # PR 35414 Francesco Alemanno + @test replace("foobarbaz", "oo" => "zz", "ar" => "zz", "z" => "m") == "fzzbzzbam" + substmp=["z" => "m", "oo" => "zz", "ar" => "zz"] + for perm in [[1, 2, 3], [2, 1, 3], [3, 2, 1], [2, 3, 1], [1, 3, 2], [3, 1, 2]] + @test replace("foobarbaz", substmp[perm]...) 
== "fzzbzzbam" + @test replace("foobarbaz", substmp[perm]..., count=2) == "fzzbzzbaz" + @test replace("foobarbaz", substmp[perm]..., count=1) == "fzzbarbaz" + end + @test replace("foobarbaz", "z" => "m", r"a.*a" => uppercase) == "foobARBAm" + @test replace("foobarbaz", 'o' => 'z', 'a' => 'q', 'z' => 'm') == "fzzbqrbqm" + + + # PR #25732 Klaus Crusius + @test replace("\u2202", '*' => '\0', "" => "") == "\u2202" + + @test replace("foobar", 'o' => '0', "" => "") == "f00bar" + @test replace("foobar", 'o' => '0', count=1, "" => "") == "foobar" + @test replace("foobar", 'o' => '0', count=2, "" => "") == "f0obar" + @test replace("foobar", 'o' => "", "" => "") == "fbar" + @test replace("foobar", 'o' => "", count=1, "" => "") == "foobar" + @test replace("foobar", 'o' => "", count=2, "" => "") == "fobar" + @test replace("foobar", 'f' => 'F', "" => "") == "Foobar" + @test replace("foobar", 'r' => 'R', "" => "") == "foobaR" + + @test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar" + @test replace("foobarfoo", "foo" => "baz", "" => "") == "bazbarbaz" + @test replace("barfoofoo", "foo" => "baz", "" => "") == "barbazbaz" + + @test replace("", "" => "", "" => "") == "" + @test replace("", "" => "x", "" => "") == "x" + @test replace("", "x" => "y", "" => "") == "" + + @test replace("abcd", "" => "^", "" => "") == "^a^b^c^d^" + @test replace("abcd", "b" => "^", "" => "") == "a^cd" + @test replace("abcd", r"b?" => "^", "" => "") == "^a^c^d^" + @test replace("abcd", r"b+" => "^", "" => "") == "a^cd" + @test replace("abcd", r"b?c?" => "^", "" => "") == "^a^d^" + @test replace("abcd", r"[bc]?" 
=> "^", "" => "") == "^a^^d^" + + @test replace("foobarfoo", r"(fo|ba)" => "xx", "" => "") == "xxoxxrxxo" + @test replace("foobarfoo", r"(foo|ba)" => "bar", "" => "") == "barbarrbar" + + @test replace("foobar", 'o' => 'ø', "" => "") == "føøbar" + @test replace("foobar", 'o' => 'ø', count=2, "" => "") == "føobar" + @test replace("føøbar", 'ø' => 'o', "" => "") == "foobar" + @test replace("føøbar", 'ø' => 'o', count=2, "" => "") == "foøbar" + @test replace("føøbar", 'ø' => 'ö', "" => "") == "fööbar" + @test replace("føøbar", 'ø' => 'ö', count=2, "" => "") == "föøbar" + @test replace("føøbar", 'ø' => "", "" => "") == "fbar" + @test replace("føøbar", 'ø' => "", count=2, "" => "") == "føbar" + @test replace("føøbar", 'f' => 'F', "" => "") == "Føøbar" + @test replace("ḟøøbar", 'ḟ' => 'F', "" => "") == "Føøbar" + @test replace("føøbar", 'f' => 'Ḟ', "" => "") == "Ḟøøbar" + @test replace("ḟøøbar", 'ḟ' => 'Ḟ', "" => "") == "Ḟøøbar" + @test replace("føøbar", 'r' => 'R', "" => "") == "føøbaR" + @test replace("føøbaṙ", 'ṙ' => 'R', "" => "") == "føøbaR" + @test replace("føøbar", 'r' => 'Ṙ', "" => "") == "føøbaṘ" + @test replace("føøbaṙ", 'ṙ' => 'Ṙ', "" => "") == "føøbaṘ" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar" + @test replace("ḟøøbarḟøø", "ḟøø" => "baz", "" => "") == "bazbarbaz" + @test replace("barḟøøḟøø", "ḟøø" => "baz", "" => "") == "barbazbaz" + + @test replace("foofoofoo", "foo" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("fooƀäṙfoo", "foo" => "baz", "" => "") == "bazƀäṙbaz" + @test replace("ƀäṙfoofoo", "foo" => "baz", "" => "") == "ƀäṙbazbaz" + + @test replace("foofoofoo", "foo" => "bar", "" => "") == "barbarbar" + @test replace("foobarfoo", "foo" => "ƀäż", "" => "") == "ƀäżbarƀäż" + @test replace("barfoofoo", "foo" => "ƀäż", "" => "") == "barƀäżƀäż" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("ḟøøƀäṙḟøø", "ḟøø" => "baz", "" => "") == "bazƀäṙbaz" + @test replace("ƀäṙḟøøḟøø", "ḟøø" => "baz", 
"" => "") == "ƀäṙbazbaz" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "bar", "" => "") == "barbarbar" + @test replace("ḟøøbarḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżbarƀäż" + @test replace("barḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "barƀäżƀäż" + + @test replace("ḟøøḟøøḟøø", "ḟøø" => "ƀäṙ", "" => "") == "ƀäṙƀäṙƀäṙ" + @test replace("ḟøøƀäṙḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäżƀäṙƀäż" + @test replace("ƀäṙḟøøḟøø", "ḟøø" => "ƀäż", "" => "") == "ƀäṙƀäżƀäż" + + @test replace("", "" => "ẍ", "" => "") == "ẍ" + @test replace("", "ẍ" => "ÿ", "" => "") == "" + + @test replace("äƀçđ", "" => "π", "" => "") == "πäπƀπçπđπ" + @test replace("äƀçđ", "ƀ" => "π", "" => "") == "äπçđ" + @test replace("äƀçđ", r"ƀ?" => "π", "" => "") == "πäπçπđπ" + @test replace("äƀçđ", r"ƀ+" => "π", "" => "") == "äπçđ" + @test replace("äƀçđ", r"ƀ?ç?" => "π", "" => "") == "πäπđπ" + @test replace("äƀçđ", r"[ƀç]?" => "π", "" => "") == "πäππđπ" + + @test replace("foobarfoo", r"(fo|ba)" => "ẍẍ", "" => "") == "ẍẍoẍẍrẍẍo" + + @test replace("ḟøøbarḟøø", r"(ḟø|ba)" => "xx", "" => "") == "xxøxxrxxø" + @test replace("ḟøøbarḟøø", r"(ḟøø|ba)" => "bar", "" => "") == "barbarrbar" + + @test replace("fooƀäṙfoo", r"(fo|ƀä)" => "xx", "" => "") == "xxoxxṙxxo" + @test replace("fooƀäṙfoo", r"(foo|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ" + + @test replace("ḟøøƀäṙḟøø", r"(ḟø|ƀä)" => "xx", "" => "") == "xxøxxṙxxø" + @test replace("ḟøøƀäṙḟøø", r"(ḟøø|ƀä)" => "ƀäṙ", "" => "") == "ƀäṙƀäṙṙƀäṙ" + + @test replace("foo", "oo" => uppercase, "" => "") == "fOO" + + # Issue 13332 + @test replace("abc", 'b' => 2.1, "" => "") == "a2.1c" + + # test replace with a count for String and GenericString + # check that replace is a no-op if count==0 + for s in ["aaa", Test.GenericString("aaa")] + @test_throws DomainError replace(s, 'a' => "", count = -1, "" => "") + @test replace(s, 'a' => 'z', count=0, "" => "")::String == s + @test replace(s, 'a' => 'z', count=1, "" => "") == "zaa" + @test replace(s, 'a' => 'z', count=2, "" => "") == "zza" + @test 
replace(s, 'a' => 'z', count=3, "" => "") == "zzz" + @test replace(s, 'a' => 'z', count=4, "" => "") == "zzz" + @test replace(s, 'a' => 'z', count=typemax(Int), "" => "") == "zzz" + @test replace(s, 'a' => 'z', "" => "") == "zzz" + end + + let s = "abc" + @test replace(s) === s + @test replace(s, 'a' => 'z', "" => "") === "zbc" + @test replace(s, 'a' => 'z', 'b' => 'y') == "zyc" + @test replace(s, 'a' => 'z', 'c' => 'x', "b" => 'y') == "zyx" + @test replace(s, '1' => 'z', "" => "") == s + @test replace(s, 'b' => "BbB", "" => "", count=2) == "aBbBc" + end + + let s = "quick quicker quickest" + @test replace(s) === s + @test replace(s, "quickest" => 'z', "quicker" => uppercase, "quick" => 'a') == "a QUICKER z" + @test replace(s, "quick" => 'a', "quicker" => uppercase, "quickest" => 'z') == "a aer aest" + @test replace(s, "quickest" => "lame", "quicker" => "is", "quick" => "Duck", count=2) == "Duck is quickest" + @test "1q1u1i1c1k1 1q1u1i1c1k1e1r1 1q1u1i1c1k1e1s1t1" == + replace(s, "" => '1', "" => "") == + replace(s, "" => '1', "" => '2') + @test replace(s, "qu" => "QU", "qu" => "never happens", "ick" => "") == "QU QUer QUest" + @test replace(s, " " => '_', "r " => "r-") == "quick_quicker-quickest" + @test replace(s, r"[aeiou]" => "ä", "ui" => "ki", "i" => "I") == "qääck qääckär qääckäst" + @test replace(s, "i" => "I", "ui" => "ki", r"[aeiou]" => "ä") == "qkick qkickär qkickäst" + @test replace(s, r"[^ ]+" => "word", "quicker " => "X", count=big"99") == "word word word" + @test replace(s, "quicker " => "X", r"[^ ]+" => "word", count=big"99") == "word Xword" + + @test replace(s, r"(quick)(e)" => s"\2-\1", "x" => "X") == "quick e-quickr e-quickst" + + @test replace(s, 'q' => 'Q', 'u' => 'U') == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', r"u" => 'U') == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', ==('u') => uppercase) == "QUick QUicker QUickest" + @test replace(s, 'q' => 'Q', islowercase => '-') == "Q---- Q------ Q-------" + @test replace(s, 
['q', 'u'] => 'K') == "KKick KKicker KKickest" + @test replace(s, occursin("uq") => 'K') == "KKick KKicker KKickest" + @test replace(s, ==('q') => "B") == "Buick Buicker Buickest" + + @test replace(s, "qui" => "A", 'r' => 'R') == "Ack AckeR Ackest" + @test replace(s, 'r' => 'x', islowercase => uppercase) == "QUICK QUICKEx QUICKEST" + @test replace(s, islowercase => uppercase, 'r' => 'x') == "QUICK QUICKER QUICKEST" + @test replace(s, "q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST" + @test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST" + @test replace(s, "qui" => "A", 'r' => 'x', islowercase => uppercase) == "ACK ACKEx ACKEST" + @test replace(s, r"q" => "z", islowercase => uppercase, 'r' => 'x') == "zUICK zUICKER zUICKEST" + + @test replace(s, "q" => s"a\0b") == "aqbuick aqbuicker aqbuickest" + @test replace(s, "q" => s"a\0b\n\\\g<0>") == "aqb\n\\quick aqb\n\\quicker aqb\n\\quickest" + @test_throws ErrorException("PCRE error: unknown substring") replace(s, r"q" => s"a\1b") + @test_throws ErrorException("Bad replacement string: pattern is not a Regex") replace(s, "q" => s"a\1b") + end +end + @testset "chomp/chop" begin @test chomp("foo\n") == "foo" @test chomp("fo∀\n") == "fo∀"