Skip to content

Commit

Permalink
StringScanner#scan_integer support base 16 integers
Browse files Browse the repository at this point in the history
Followup: ruby#115

`scan_integer` is now implemented in Ruby so that it can handle keyword
arguments efficiently, without allocating a Hash. Given that the goal of `scan_integer`
is to parse integers more efficiently, without having to allocate an intermediary
object, using `rb_scan_args` would defeat the purpose.

Additionally, the C implementation now uses `rb_isdigit` and `rb_isxdigit`,
because on Windows `isdigit` is locale dependent.
  • Loading branch information
byroot committed Nov 26, 2024
1 parent 6a3c74b commit e68e38f
Show file tree
Hide file tree
Showing 6 changed files with 182 additions and 23 deletions.
2 changes: 0 additions & 2 deletions ext/jruby/lib/strscan.rb

This file was deleted.

45 changes: 43 additions & 2 deletions ext/jruby/org/jruby/ext/strscan/RubyStringScanner.java
Original file line number Diff line number Diff line change
Expand Up @@ -557,8 +557,8 @@ public IRubyObject peep(ThreadContext context, IRubyObject length) {
return peek(context, length);
}

@JRubyMethod(name = "scan_integer")
public IRubyObject scan_integer(ThreadContext context) {
@JRubyMethod(name = "scan_base10_integer", visibility = PRIVATE)
public IRubyObject scan_base10_integer(ThreadContext context) {
final Ruby runtime = context.runtime;
check(context);
clearMatched();
Expand Down Expand Up @@ -598,6 +598,47 @@ public IRubyObject scan_integer(ThreadContext context) {
return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 10, true);
}

/**
 * Private backend for {@code StringScanner#scan_integer(base: 16)}.
 *
 * Scans an optional {@code 0x} prefix followed by one or more hexadecimal
 * digits ({@code 0-9}, {@code a-f}, {@code A-F}) starting at the current scan
 * position. Signs are not accepted.
 *
 * On success the scan position is advanced past the digits, the match state
 * is updated and the parsed Integer is returned. When no hexadecimal integer
 * starts at the current position (including at the end of the string, or
 * when a {@code 0x} prefix is not followed by a digit) the position is left
 * untouched and {@code nil} is returned.
 *
 * @param context the current thread context
 * @return the scanned Integer, or nil when nothing matched
 * @throws org.jruby.exceptions.RaiseException Encoding::CompatibilityError
 *         when the scanned string is not ASCII compatible
 */
@JRubyMethod(name = "scan_base16_integer", visibility = PRIVATE)
public IRubyObject scan_base16_integer(ThreadContext context) {
    final Ruby runtime = context.runtime;
    check(context);
    clearMatched();

    if (!str.getEncoding().isAsciiCompatible()) {
        throw runtime.newEncodingCompatibilityError("ASCII incompatible encoding: " + str.getEncoding());
    }

    ByteList bytes = str.getByteList();
    final int end = bytes.getRealSize();
    int curr = this.curr;

    // Nothing left to scan: never read past the logical end of the buffer.
    if (curr >= end) {
        return context.nil;
    }

    // Optional "0x" prefix; only inspected when at least two bytes remain.
    if (end - curr >= 2 && bytes.get(curr) == '0' && bytes.get(curr + 1) == 'x') {
        curr += 2;
    }

    // At least one hexadecimal digit is required (a bare "0x" does not match).
    if (curr >= end || !isHexDigitByte(bytes.get(curr))) {
        return context.nil;
    }

    // The byte at `curr` is known to be a digit; consume it and any that follow.
    do {
        curr++;
    } while (curr < end && isHexDigitByte(bytes.get(curr)));

    prev = this.curr;
    this.curr = curr;
    setMatched();
    adjustRegisters();

    return ConvertBytes.byteListToInum(runtime, bytes, prev, curr, 16, true);
}

/** Returns true when {@code b} is an ASCII hexadecimal digit (0-9, a-f, A-F). */
private static boolean isHexDigitByte(int b) {
    return (b >= '0' && b <= '9') || (b >= 'a' && b <= 'f') || (b >= 'A' && b <= 'F');
}

@JRubyMethod(name = "unscan")
public IRubyObject unscan(ThreadContext context) {
check(context);
Expand Down
68 changes: 53 additions & 15 deletions ext/strscan/strscan.c
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,6 @@
extern size_t onig_region_memsize(const struct re_registers *regs);
#endif

#include <ctype.h>
#include <stdbool.h>

#define STRSCAN_VERSION "3.1.1"
Expand Down Expand Up @@ -116,7 +115,7 @@ static VALUE strscan_get_byte _((VALUE self));
static VALUE strscan_getbyte _((VALUE self));
static VALUE strscan_peek _((VALUE self, VALUE len));
static VALUE strscan_peep _((VALUE self, VALUE len));
static VALUE strscan_scan_integer _((VALUE self));
static VALUE strscan_scan_base10_integer _((VALUE self));
static VALUE strscan_unscan _((VALUE self));
static VALUE strscan_bol_p _((VALUE self));
static VALUE strscan_eos_p _((VALUE self));
Expand Down Expand Up @@ -1268,17 +1267,8 @@ strscan_peep(VALUE self, VALUE vlen)
return strscan_peek(self, vlen);
}

/*
* call-seq:
* scan_integer
*
* Equivalent to #scan with a [+-]?\d+ pattern, and returns an Integer or nil.
*
* The scanned string must be encoded with an ASCII compatible encoding, otherwise
* Encoding::CompatibilityError will be raised.
*/
static VALUE
strscan_scan_integer(VALUE self)
strscan_scan_base10_integer(VALUE self)
{
char *ptr, *buffer;
long len = 0;
Expand All @@ -1302,14 +1292,14 @@ strscan_scan_integer(VALUE self)
len++;
}

if (!isdigit(ptr[len])) {
if (!rb_isdigit(ptr[len])) {
return Qnil;
}

MATCHED(p);
p->prev = p->curr;

while (len < remaining_len && isdigit(ptr[len])) {
while (len < remaining_len && rb_isdigit(ptr[len])) {
len++;
}

Expand All @@ -1323,6 +1313,53 @@ strscan_scan_integer(VALUE self)
return integer;
}


/*
 * Private backend for StringScanner#scan_integer(base: 16).
 *
 * Scans an optional "0x" prefix followed by one or more hexadecimal digits
 * at the current scan position and returns the value as an Integer.
 * Returns nil, without advancing, when nothing is left to scan or when the
 * text at the current position does not start with a hexadecimal digit
 * (after the optional prefix).
 *
 * rb_must_asciicompat() is called on the scanned string before any byte is
 * inspected.
 */
static VALUE
strscan_scan_base16_integer(VALUE self)
{
    struct strscanner *p;
    char *start, *copy;
    VALUE copy_v, result;
    long avail;
    long n = 0;

    GET_SCANNER(self, p);
    CLEAR_MATCH_STATUS(p);

    rb_must_asciicompat(p->str);

    start = CURPTR(p);
    avail = S_RESTLEN(p);

    if (avail <= 0) {
        return Qnil;
    }

    /* Skip the optional "0x" prefix when both of its bytes are available. */
    if (avail >= 2 && start[0] == '0' && start[1] == 'x') {
        n = 2;
    }

    /* At least one hexadecimal digit must follow. */
    if (n >= avail || !rb_isxdigit(start[n])) {
        return Qnil;
    }

    MATCHED(p);
    p->prev = p->curr;

    /* start[n] is already known to be a digit: consume it and its successors. */
    do {
        n++;
    } while (n < avail && rb_isxdigit(start[n]));

    /* rb_cstr2inum() needs a NUL-terminated string, so work on a temporary copy. */
    copy = RB_ALLOCV_N(char, copy_v, n + 1);

    MEMCPY(copy, CURPTR(p), char, n);
    copy[n] = '\0';
    result = rb_cstr2inum(copy, 16);
    RB_ALLOCV_END(copy_v);

    p->curr += n;
    return result;
}

/*
* :markup: markdown
* :include: strscan/link_refs.txt
Expand Down Expand Up @@ -2261,7 +2298,8 @@ Init_strscan(void)
rb_define_method(StringScanner, "peek_byte", strscan_peek_byte, 0);
rb_define_method(StringScanner, "peep", strscan_peep, 1);

rb_define_method(StringScanner, "scan_integer", strscan_scan_integer, 0);
rb_define_private_method(StringScanner, "scan_base10_integer", strscan_scan_base10_integer, 0);
rb_define_private_method(StringScanner, "scan_base16_integer", strscan_scan_base16_integer, 0);

rb_define_method(StringScanner, "unscan", strscan_unscan, 0);

Expand Down
32 changes: 32 additions & 0 deletions lib/strscan.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,32 @@
# frozen_string_literal: true

# Load the platform-specific StringScanner implementation: the Java extension
# when RUBY_PLATFORM is "java", the compiled `strscan.so` otherwise.
if RUBY_PLATFORM == "java"
require 'strscan.jar'
JRuby::Util.load_ext("org.jruby.ext.strscan.StringScannerLibrary")
else
require "strscan.so"
end

class StringScanner
  # call-seq:
  #   scan_integer(base: 10)
  #
  # Scans an integer at the current position and returns it as an Integer,
  # or returns +nil+ when no integer is found there.
  #
  # With `base: 10` (the default) this behaves like `#scan` with a
  # `[+-]?\d+` pattern.
  #
  # With `base: 16` this behaves like `#scan` with a
  # `(0x)?[0-9a-fA-F]+` pattern.
  #
  # Any other `base` raises ArgumentError.
  #
  # The scanned string must use an ASCII compatible encoding; otherwise
  # Encoding::CompatibilityError is raised.
  def scan_integer(base: 10)
    # Dispatch to the per-base private scanner.
    case base
    when 10 then scan_base10_integer
    when 16 then scan_base16_integer
    else raise ArgumentError, "Unsupported integer base: #{base.inspect}, expected 10 or 16"
    end
  end
end
8 changes: 4 additions & 4 deletions strscan.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -19,14 +19,14 @@ Gem::Specification.new do |s|
files = [
"COPYING",
"LICENSE.txt",
]
] + Dir[File.join(__dir__, "lib/**/*.rb")]

s.require_paths = %w{lib}

if RUBY_ENGINE == "jruby"
s.require_paths = %w{ext/jruby/lib lib}
files << "ext/jruby/lib/strscan.rb"
files << "lib/strscan.jar"
s.platform = "java"
else
s.require_paths = %w{lib}
files << "ext/strscan/extconf.rb"
files << "ext/strscan/strscan.c"
s.rdoc_options << "-idoc"
Expand Down
50 changes: 50 additions & 0 deletions test/strscan/test_stringscanner.rb
Original file line number Diff line number Diff line change
Expand Up @@ -945,6 +945,56 @@ def test_scan_integer_encoding
s.scan_integer
end
end

# Exercises scan_integer(base: 16): inputs that must be scanned as a
# hexadecimal Integer, inputs that must be rejected without moving the scan
# position, and a 2000-digit input.
def test_scan_integer_base_16
  omit "scan_integer isn't implemented on TruffleRuby yet" if RUBY_ENGINE == "truffleruby"

  # Each entry: [source, expected Integer, expected position after the scan].
  accepted = [
    ['abc', 0xabc, 3],
    ['123abc', 0x123abc, 6],
    ['0x123abc', 0x123abc, 8],
    ['0x123ABC', 0x123abc, 8],
  ]
  accepted.each do |source, value, position|
    scanner = create_string_scanner(source)
    assert_equal value, scanner.scan_integer(base: 16)
    assert_equal position, scanner.pos
    assert_predicate scanner, :matched?
  end

  # A bare prefix and signed inputs must not match nor move the position.
  rejected = ['0x', '-123abc', '+123', '-abc']
  rejected.each do |source|
    scanner = create_string_scanner(source)
    assert_nil scanner.scan_integer(base: 16)
    assert_equal 0, scanner.pos
    refute_predicate scanner, :matched?
  end

  # A 2000-digit input must be scanned in full.
  digits = '1' * 2_000
  scanner = create_string_scanner(digits)
  assert_equal digits.to_i(16), scanner.scan_integer(base: 16)
  assert_equal 2_000, scanner.pos
  assert_predicate scanner, :matched?
end
end

class TestStringScanner < Test::Unit::TestCase
Expand Down

0 comments on commit e68e38f

Please sign in to comment.