Skip to content

Commit

Permalink
Fix backward search with .* (Issue #69)
Browse files Browse the repository at this point in the history
This bug was introduced when I implemented the implicit anchor optimization.
In fact, the changes for the implicit anchor optimization were entirely
unnecessary, because Oniguruma already had this optimization. (Oniguruma's
own implementation of the optimization has a separate bug, though.)

Revert the following commits:

* 4a2a618
* 3ea0268
* e20e852

Also add some tests.
  • Loading branch information
k-takata committed Dec 12, 2016
1 parent 1805c8f commit ebccac2
Show file tree
Hide file tree
Showing 4 changed files with 9 additions and 56 deletions.
45 changes: 5 additions & 40 deletions regcomp.c
Original file line number Diff line number Diff line change
Expand Up @@ -1484,9 +1484,6 @@ compile_anchor_node(AnchorNode* node, regex_t* reg)
case ANCHOR_SEMI_END_BUF: r = add_opcode(reg, OP_SEMI_END_BUF); break;
case ANCHOR_BEGIN_POSITION: r = add_opcode(reg, OP_BEGIN_POSITION); break;

/* used for implicit anchor optimization: /.*a/ ==> /(?:^|\G).*a/ */
case ANCHOR_ANYCHAR_STAR: r = add_opcode(reg, OP_BEGIN_POS_OR_LINE); break;

case ANCHOR_WORD_BOUND:
if (node->ascii_range) r = add_opcode(reg, OP_ASCII_WORD_BOUND);
else r = add_opcode(reg, OP_WORD_BOUND);
Expand Down Expand Up @@ -3295,7 +3292,7 @@ setup_look_behind(Node* node, regex_t* reg, ScanEnv* env)
}

static int
next_setup(Node* node, Node* next_node, int in_root, regex_t* reg)
next_setup(Node* node, Node* next_node, regex_t* reg)
{
int type;

Expand Down Expand Up @@ -3329,32 +3326,10 @@ next_setup(Node* node, Node* next_node, int in_root, regex_t* reg)
}
}
}

#ifndef ONIG_DONT_OPTIMIZE
if (NTYPE(node) == NT_QTFR && /* the type may be changed by above block */
in_root && /* qn->lower == 0 && */
NTYPE(qn->target) == NT_CANY &&
! IS_MULTILINE(reg->options)) {
/* implicit anchor: /.*a/ ==> /(?:^|\G).*a/ */
Node *np;
np = onig_node_new_list(NULL_NODE, NULL_NODE);
CHECK_NULL_RETURN_MEMERR(np);
swap_node(node, np);
NCDR(node) = onig_node_new_list(np, NULL_NODE);
if (IS_NULL(NCDR(node))) {
onig_node_free(np);
return ONIGERR_MEMORY;
}
np = onig_node_new_anchor(ANCHOR_ANYCHAR_STAR); /* (?:^|\G) */
CHECK_NULL_RETURN_MEMERR(np);
NCAR(node) = np;
}
#endif
}
}
else if (type == NT_ENCLOSE) {
EncloseNode* en = NENCLOSE(node);
in_root = 0;
if (en->type == ENCLOSE_MEMORY) {
node = en->target;
goto retry;
Expand Down Expand Up @@ -3852,9 +3827,8 @@ setup_comb_exp_check(Node* node, int state, ScanEnv* env)
#define IN_NOT (1<<1)
#define IN_REPEAT (1<<2)
#define IN_VAR_REPEAT (1<<3)
#define IN_ROOT (1<<4)
#define IN_CALL (1<<5)
#define IN_RECCALL (1<<6)
#define IN_CALL (1<<4)
#define IN_RECCALL (1<<5)

/* setup_tree does the following work.
1. check empty loop. (set qn->target_empty_info)
Expand All @@ -3869,25 +3843,19 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
{
int type;
int r = 0;
int in_root = state & IN_ROOT;

state &= ~IN_ROOT;
restart:
type = NTYPE(node);
switch (type) {
case NT_LIST:
{
Node* prev = NULL_NODE;
int prev_in_root = 0;
state |= in_root;
do {
r = setup_tree(NCAR(node), reg, state, env);
if (IS_NOT_NULL(prev) && r == 0) {
r = next_setup(prev, NCAR(node), prev_in_root, reg);
r = next_setup(prev, NCAR(node), reg);
}
prev = NCAR(node);
prev_in_root = state & IN_ROOT;
state &= ~IN_ROOT;
} while (r == 0 && IS_NOT_NULL(node = NCDR(node)));
}
break;
Expand Down Expand Up @@ -4051,7 +4019,6 @@ setup_tree(Node* node, regex_t* reg, int state, ScanEnv* env)
case ENCLOSE_OPTION:
{
OnigOptionType options = reg->options;
state |= in_root;
reg->options = NENCLOSE(node)->option;
r = setup_tree(NENCLOSE(node)->target, reg, state, env);
reg->options = options;
Expand Down Expand Up @@ -5782,7 +5749,7 @@ onig_compile(regex_t* reg, const UChar* pattern, const UChar* pattern_end,
reg->num_call = 0;
#endif

r = setup_tree(root, reg, IN_ROOT, &scan_env);
r = setup_tree(root, reg, 0, &scan_env);
if (r != 0) goto err_unset;

#ifdef ONIG_DEBUG_PARSE_TREE
Expand Down Expand Up @@ -6173,7 +6140,6 @@ OnigOpInfoType OnigOpInfo[] = {
{ OP_END_LINE, "end-line", ARG_NON },
{ OP_SEMI_END_BUF, "semi-end-buf", ARG_NON },
{ OP_BEGIN_POSITION, "begin-position", ARG_NON },
{ OP_BEGIN_POS_OR_LINE, "begin-pos-or-line", ARG_NON },
{ OP_BACKREF1, "backref1", ARG_NON },
{ OP_BACKREF2, "backref2", ARG_NON },
{ OP_BACKREFN, "backrefn", ARG_MEMNUM },
Expand Down Expand Up @@ -6629,7 +6595,6 @@ print_indent_tree(FILE* f, Node* node, int indent)
case ANCHOR_END_LINE: fputs("end line", f); break;
case ANCHOR_SEMI_END_BUF: fputs("semi end buf", f); break;
case ANCHOR_BEGIN_POSITION: fputs("begin position", f); break;
case ANCHOR_ANYCHAR_STAR: fputs("begin position/line", f); break;

case ANCHOR_WORD_BOUND: fputs("word bound", f); break;
case ANCHOR_NOT_WORD_BOUND: fputs("not word bound", f); break;
Expand Down
11 changes: 0 additions & 11 deletions regexec.c
Original file line number Diff line number Diff line change
Expand Up @@ -1484,7 +1484,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
&&L_OP_END_LINE,
&&L_OP_SEMI_END_BUF,
&&L_OP_BEGIN_POSITION,
&&L_OP_BEGIN_POS_OR_LINE, /* used for implicit anchor optimization */

&&L_OP_BACKREF1,
&&L_OP_BACKREF2,
Expand Down Expand Up @@ -2378,7 +2377,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
JUMP;

CASE(OP_BEGIN_LINE) MOP_IN(OP_BEGIN_LINE);
op_begin_line:
if (ON_STR_BEGIN(s)) {
if (IS_NOTBOL(msa->options)) goto fail;
MOP_OUT;
Expand Down Expand Up @@ -2454,13 +2452,6 @@ match_at(regex_t* reg, const UChar* str, const UChar* end,
MOP_OUT;
JUMP;

CASE(OP_BEGIN_POS_OR_LINE) MOP_IN(OP_BEGIN_POS_OR_LINE);
if (s != msa->gpos)
goto op_begin_line;

MOP_OUT;
JUMP;

CASE(OP_MEMORY_START_PUSH) MOP_IN(OP_MEMORY_START_PUSH);
GET_MEMNUM_INC(mem, p);
STACK_PUSH_MEM_START(mem, s);
Expand Down Expand Up @@ -4302,8 +4293,6 @@ onig_search_gpos(regex_t* reg, const UChar* str, const UChar* end,

if ((reg->anchor & ANCHOR_ANYCHAR_STAR) != 0) {
do {
if ((reg->anchor & ANCHOR_BEGIN_POSITION) == 0)
msa.gpos = s; /* move \G position */
MATCH_AND_RETURN_CHECK(orig_range);
prev = s;
s += enclen(reg->enc, s, end);
Expand Down
1 change: 0 additions & 1 deletion regint.h
Original file line number Diff line number Diff line change
Expand Up @@ -598,7 +598,6 @@ enum OpCode {
OP_END_LINE,
OP_SEMI_END_BUF,
OP_BEGIN_POSITION,
OP_BEGIN_POS_OR_LINE, /* used for implicit anchor optimization */

OP_BACKREF1,
OP_BACKREF2,
Expand Down
8 changes: 4 additions & 4 deletions testpy.py
Original file line number Diff line number Diff line change
Expand Up @@ -1564,6 +1564,8 @@ def main():
x2("(?m:.*abc)", "dddabdd\nddabc", 0, 13) # optimized /(?m:.*abc)/ ==> /\A(?m:.*abc)/
x2("(?m:.+abc)", "dddabdd\nddabc", 0, 13) # optimized
x2("(?-m:.*abc)", "dddabdd\nddabc", 8, 13) # optimized /(?-m:.*abc)/ ==> /(?:^|\A)(?m:.*abc)/
n("(?-m:.*ab[x-z])", "dddabdd\nddabc") # optimized
x2("(?-m:.*(?:abc|\\Gabc))", "dddabdd\nddabc", 8, 13) # optimized
x2("(?-m:.+abc)", "dddabdd\nddabc", 8, 13) # optimized
x2("(?-m:.*abc)", "dddabdd\nabc", 8, 11) # optimized
n("(?-m:.+abc)", "dddabdd\nabc") # optimized
Expand Down Expand Up @@ -1604,10 +1606,8 @@ def main():
x2("(?i)abc", "ABCABC", 3, 6, searchtype=SearchType.BACKWARD)
x2("[a-z]{3}$", "abcabc", 3, 6, searchtype=SearchType.BACKWARD)
x2("[あ-ん]{3}$", "あいうあいう", 3, 6, searchtype=SearchType.BACKWARD)

# These match differently. Is it okay?
x2(".*[a-z]bc", "abcabc", 0, 6, searchtype=SearchType.BACKWARD)
x2(".+[a-z]bc", "abcabc", 0, 6, searchtype=SearchType.BACKWARD)
x2(".*[a-z]bc", "abcabc", 3, 6, searchtype=SearchType.BACKWARD) # Issue #69
x2(".+[a-z]bc", "abcabc", 2, 6, searchtype=SearchType.BACKWARD) # Issue #69
x2(".{1,3}[a-z]bc", "abcabc", 2, 6, searchtype=SearchType.BACKWARD)

# onig_match()
Expand Down

0 comments on commit ebccac2

Please sign in to comment.