From 50ed873c32a7275ba5ac577d41d8f23bc5504b87 Mon Sep 17 00:00:00 2001 From: Jean Boussier Date: Thu, 14 Nov 2024 08:44:42 +0100 Subject: [PATCH] Implement #scan_integer to efficiently parse Integer Fix: https://github.com/ruby/strscan/issues/113 This allows an Integer to be parsed directly from a String without first allocating a substring. Notes: The implementation is limited by design: it is meant as a first step, and only the most straightforward base-10 integers are supported. --- .../jruby/ext/strscan/RubyStringScanner.java | 42 ++++++++++++++++ ext/strscan/strscan.c | 48 ++++++++++++++++++ test/strscan/test_stringscanner.rb | 49 +++++++++++++++++++ 3 files changed, 139 insertions(+) diff --git a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java index 7d3e7494fc..2455ef33ce 100644 --- a/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java +++ b/ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java @@ -54,6 +54,7 @@ import org.jruby.runtime.builtin.IRubyObject; import org.jruby.util.ByteList; import org.jruby.util.StringSupport; +import org.jruby.util.ConvertBytes; import java.util.Iterator; @@ -556,6 +557,47 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) { return peek(context, length); } + @JRubyMethod(name = "scan_integer") + public IRubyObject scan_integer(ThreadContext context) { + final Ruby runtime = context.runtime; + check(context); + clearMatched(); + + if (!str.getEncoding().isAsciiCompatible()) { + throw getRuntime().newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding()); + } + + + ByteList bytes = str.getByteList(); + int curr = this.curr; + + int bite = bytes.get(curr); + if (bite == '-' || bite == '+') { + curr++; + bite = bytes.get(curr); + } + + if (!(bite >= '0' && bite <= '9')) { + return runtime.getNil(); + } + + while (bite >= '0' && bite <= '9') { + curr++; + if (curr >= bytes.getRealSize()) { + break; + } + bite = 
bytes.get(curr); + } + + int length = curr - this.curr; + prev = this.curr; + this.curr = curr; + setMatched(); + adjustRegisters(); + + return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true); + } + @JRubyMethod(name = "unscan") public IRubyObject unscan(ThreadContext context) { check(context); diff --git a/ext/strscan/strscan.c b/ext/strscan/strscan.c index 0448b9c16b..6a6ab97f31 100644 --- a/ext/strscan/strscan.c +++ b/ext/strscan/strscan.c @@ -115,6 +115,7 @@ static VALUE strscan_get_byte _((VALUE self)); static VALUE strscan_getbyte _((VALUE self)); static VALUE strscan_peek _((VALUE self, VALUE len)); static VALUE strscan_peep _((VALUE self, VALUE len)); +static VALUE strscan_scan_integer _((VALUE self)); static VALUE strscan_unscan _((VALUE self)); static VALUE strscan_bol_p _((VALUE self)); static VALUE strscan_eos_p _((VALUE self)); @@ -1266,6 +1267,51 @@ strscan_peep(VALUE self, VALUE vlen) return strscan_peek(self, vlen); } +/* + * call-seq: + * scan_integer + * + * Equivalent to #scan with a [+-]?\d+ pattern, and returns an Integer or nil. 
+ */ +static VALUE +strscan_scan_integer(VALUE self) +{ + struct strscanner *p; + + GET_SCANNER(self, p); + CLEAR_MATCH_STATUS(p); + + rb_must_asciicompat(p->str); + + char *ptr = CURPTR(p); + + long len = 0; + if (ptr[len] == '-' || ptr[len] == '+') { + len++; + } + + if (!isdigit(ptr[len])) { + return Qnil; + } + + MATCHED(p); + p->prev = p->curr; + + while(isdigit(ptr[len])) { + len++; + } + + VALUE buffer_v; + char *buffer = ALLOCV_N(char, buffer_v, len + 1); + + MEMCPY(buffer, CURPTR(p), char, len); + buffer[len] = '\0'; + VALUE integer = rb_cstr2inum(buffer, 10); + RB_GC_GUARD(buffer_v); + p->curr += len; + return integer; +} + /* * :markup: markdown * :include: strscan/link_refs.txt @@ -2204,6 +2250,8 @@ Init_strscan(void) rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0); rb_define_method(StringScanner, "peep", strscan_peep, 1); + rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0); + rb_define_method(StringScanner, "unscan", strscan_unscan, 0); rb_define_method(StringScanner, "beginning_of_line?", strscan_bol_p, 0); diff --git a/test/strscan/test_stringscanner.rb b/test/strscan/test_stringscanner.rb index 54fd5027cf..806867e9b1 100644 --- a/test/strscan/test_stringscanner.rb +++ b/test/strscan/test_stringscanner.rb @@ -890,6 +890,55 @@ def test_named_captures assert_equal(9, scan.match?(/(?<f>foo)(?<r>bar)(?<z>baz)/)) assert_equal({"f" => "foo", "r" => "bar", "z" => "baz"}, scan.named_captures) end + + def test_scan_integer + s = create_string_scanner('abc') + assert_nil s.scan_integer + assert_equal 0, s.pos + refute_predicate s, :matched? + + s = create_string_scanner('123abc') + assert_equal 123, s.scan_integer + assert_equal 3, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('-123abc') + assert_equal -123, s.scan_integer + assert_equal 4, s.pos + assert_predicate s, :matched? + + s = create_string_scanner('+123') + assert_equal 123, s.scan_integer + assert_equal 4, s.pos + assert_predicate s, :matched? 
+ + s = create_string_scanner('-abc') + assert_nil s.scan_integer + assert_equal 0, s.pos + refute_predicate s, :matched? + + huge_integer = '1' * 2_000 + s = create_string_scanner(huge_integer) + assert_equal huge_integer.to_i, s.scan_integer + assert_equal 2_000, s.pos + assert_predicate s, :matched? + end + + def test_scan_integer_unmatch + s = create_string_scanner('123abc') + assert_equal 123, s.scan_integer + assert_equal 3, s.pos + + s.unscan + assert_equal 0, s.pos + end + + def test_scan_integer_encoding + s = create_string_scanner('123abc'.encode(Encoding::UTF_32LE)) + assert_raise(Encoding::CompatibilityError) do + s.scan_integer + end + end end class TestStringScanner < Test::Unit::TestCase