fix `regexp-match' performance for short matches on long strings

More specifically, for a string of length N and a match that
only looks at the first M characters, the complexity of
`regexp-match' is now O(M) instead of O(N). This allows
`regexp-split' to be O(N) for a string instead of O(N^2).

Also, fixed a bug in non-greedy matching that could affect
both long strings and input ports.
This commit is contained in:
Matthew Flatt 2011-08-18 08:44:02 -06:00
parent 09b4a55d87
commit 8eefaba187
7 changed files with 268 additions and 96 deletions

View File

@ -256,6 +256,28 @@
;; CL-PPCRE, which probably is from Perl originally.
;; The tests have been modified to avoid various incompatibilities.
;; make-reluctant-port : bytes? -> input-port?
;; Wraps `bstr` in an input port that never delivers more than one
;; byte per read or peek request, so that a consumer has to come back
;; repeatedly for more input.
(define (make-reluctant-port bstr)
  (define len (bytes-length bstr))
  (define pos 0)
  ;; Store the byte at index `i` of `bstr` at the front of `dest`;
  ;; the result is the number of bytes delivered (always 1).
  (define (deliver! dest i)
    (bytes-set! dest 0 (bytes-ref bstr i))
    1)
  ;; Read handler: hand out the byte at the cursor and advance it.
  (define (read-one! dest)
    (cond
      [(>= pos len) eof]
      [else
       (deliver! dest pos)
       (set! pos (add1 pos))
       1]))
  ;; Peek handler: hand out the byte `skip` positions past the cursor
  ;; without moving the cursor; the progress event is ignored.
  (define (peek-one! dest skip progress-evt)
    (define target (+ pos skip))
    (if (< target len)
        (deliver! dest target)
        eof))
  (make-input-port 'reluctant-bytes read-one! peek-one! void))
(map (lambda (t)
(if (pair? t)
(begin
@ -263,6 +285,7 @@
(test (caddr t) regexp-match (byte-pregexp (car t)) (bytes-append #"xxxxxxxxxx" (cadr t)) 10)
(test (caddr t) regexp-match (byte-pregexp (car t)) (bytes-append (cadr t) #"xxxxxxxxxx") 0 (bytes-length (cadr t)))
(test (caddr t) regexp-match (byte-pregexp (car t)) (open-input-bytes (cadr t)))
(test (caddr t) regexp-match (byte-pregexp (car t)) (make-reluctant-port (cadr t)))
(test (and (caddr t)
(map (lambda (v)
(and v (bytes->string/latin-1 v)))

View File

@ -289,4 +289,22 @@
(test "x y" string-join '("x" "y") " ")
(test "x" string-join '("x") " "))
;; String splitting can take longer than byte-string splitting,
;; but it should have the same computational complexity.
(let ()
  (define N 100000)
  ;; Runs `thunk` under `time-apply` and returns its CPU time with the
  ;; GC time subtracted; the thunk's own results are discarded.
  (define (cpu-minus-gc thunk)
    (let-values ([(results cpu real gc) (time-apply thunk null)])
      (- cpu gc)))
  ;; Byte-string split is measured first, then the character-string split.
  (define bytes-time
    (cpu-minus-gc (lambda () (regexp-split #rx#"." (make-bytes N)))))
  (define string-time
    (cpu-minus-gc (lambda () (regexp-split #rx"." (make-string N)))))
  ;; The expected value is #f: the `and` only produces the message when
  ;; the string split took more than 100 times as long as the byte split.
  (test #f
        'split
        (and (< (* 100 bytes-time) string-time)
             "suspiciously long time for regexp string split")))
(report-errs)

View File

@ -43,6 +43,7 @@ static int mark_regwork_MARK(void *p, struct NewGC *gc) {
gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
gcMARK2(r->lazy_string, gc);
gcMARK2(r->rewind_stack, gc);
return
gcBYTES_TO_WORDS(sizeof(Regwork));
@ -60,6 +61,7 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
gcFIXUP2(r->counters, gc);
gcFIXUP2(r->peekskip, gc);
gcFIXUP2(r->prefix, gc);
gcFIXUP2(r->lazy_string, gc);
gcFIXUP2(r->rewind_stack, gc);
return
gcBYTES_TO_WORDS(sizeof(Regwork));
@ -69,3 +71,28 @@ static int mark_regwork_FIXUP(void *p, struct NewGC *gc) {
#define mark_regwork_IS_CONST_SIZE 1
/* Size function for rx_lazy_str_t records: reports the fixed size of
   the record, converted from bytes to words by gcBYTES_TO_WORDS.
   Neither `p` nor `gc` is consulted.
   NOTE(review): this looks like generated output that mirrors the
   `mark_lazy_string` template -- confirm before editing by hand. */
static int mark_lazy_string_SIZE(void *p, struct NewGC *gc) {
return
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
}
/* Mark function for rx_lazy_str_t records: treats `p` as an
   rx_lazy_str_t and applies gcMARK2 to its two pointer fields, `s`
   and `chars`. Returns the record size in words. */
static int mark_lazy_string_MARK(void *p, struct NewGC *gc) {
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
gcMARK2(ls->s, gc);
gcMARK2(ls->chars, gc);
return
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
}
/* Fixup function for rx_lazy_str_t records: treats `p` as an
   rx_lazy_str_t and applies gcFIXUP2 to its two pointer fields, `s`
   and `chars`. Returns the record size in words. */
static int mark_lazy_string_FIXUP(void *p, struct NewGC *gc) {
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
gcFIXUP2(ls->s, gc);
gcFIXUP2(ls->chars, gc);
return
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
}
#define mark_lazy_string_IS_ATOMIC 0
#define mark_lazy_string_IS_CONST_SIZE 1

View File

@ -2177,11 +2177,21 @@ mark_regwork {
gcMARK2(r->counters, gc);
gcMARK2(r->peekskip, gc);
gcMARK2(r->prefix, gc);
gcMARK2(r->lazy_string, gc);
gcMARK2(r->rewind_stack, gc);
size:
gcBYTES_TO_WORDS(sizeof(Regwork));
}
mark_lazy_string {
mark:
rx_lazy_str_t *ls = (rx_lazy_str_t *)p;
gcMARK2(ls->s, gc);
gcMARK2(ls->chars, gc);
size:
gcBYTES_TO_WORDS(sizeof(rx_lazy_str_t));
}
END regexp;
/**********************************************************************/

View File

@ -2389,7 +2389,7 @@ static MZ_INLINE int in_ranges_ci(char *str, rxpos a, int l, int c)
/*
* Forwards.
*/
static int regtry(regexp *, char *, int, int, rxpos *, rxpos *, rxpos *, rxpos *, int *, Regwork *rw, rxpos,
static int regtry(regexp *, char *, int, int, rx_lazy_str_t *, rxpos *, rxpos *, rxpos *, rxpos *, int *, Regwork *rw, rxpos,
char *, rxpos, rxpos, int);
static int regtry_port(regexp *, Scheme_Object *, Scheme_Object *, int nonblock,
rxpos *, rxpos *, rxpos *, rxpos *, int *,
@ -2517,10 +2517,13 @@ static char *regprop();
static int
regexec(const char *who,
regexp *prog, char *string,
/* used only for strings: */
/* Used only for (bytes) strings: */
int stringpos, int stringlen, int stringorigin,
/* For lazy strings: */
rx_lazy_str_t *lazy_string,
/* Always used: */
rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack,
/* For port mode: */
Scheme_Object *port, Scheme_Object *unless_evt, int nonblock,
/* Used only when port is non-NULL: */
char **stringp, int peek, int get_offsets, intptr_t save_prior,
@ -2539,7 +2542,7 @@ regexec(const char *who,
}
/* If there is a "must appear" string, look for it. */
if (!port && (prog->regmust >= 0)) {
if (!port && !lazy_string && (prog->regmust >= 0)) {
spos = stringpos;
while (1) {
int i, l = prog->regmlen, ch, pos;
@ -2704,7 +2707,7 @@ regexec(const char *who,
return 0;
}
} else
return regtry(prog, string, stringpos, stringlen, startp, maybep, endp,
return regtry(prog, string, stringpos, stringlen, lazy_string, startp, maybep, endp,
match_stack, counters, 0,
stringorigin, prefix, prefix_len, prefix_offset, 0);
}
@ -2781,7 +2784,7 @@ regexec(const char *who,
}
}
} else {
if (regtry(prog, string, stringpos, stringlen,
if (regtry(prog, string, stringpos, stringlen, lazy_string,
startp, maybep, endp, match_stack, counters,
0, stringorigin, prefix, prefix_len, prefix_offset, 1))
return 1;
@ -2791,11 +2794,14 @@ regexec(const char *who,
return 0;
}
#define NEED_INPUT(rw, v, n) if (rw->port && (((v) + (n)) > rw->input_end)) read_more_from_regport(rw, (v) + (n))
static void read_more_from_regport(Regwork *rw, rxpos need_total);
/*
- regtry - try match at specific point
*/
static int /* 0 failure, 1 success */
regtry(regexp *prog, char *string, int stringpos, int stringlen,
regtry(regexp *prog, char *string, int stringpos, int stringlen, rx_lazy_str_t *lazy_string,
rxpos *startp, rxpos *maybep, rxpos *endp, rxpos *match_stack, int *counters,
Regwork *rw, rxpos stringorigin,
char *prefix, rxpos prefix_len, rxpos prefix_offset,
@ -2833,6 +2839,9 @@ regtry(regexp *prog, char *string, int stringpos, int stringlen,
rw->non_tail = -1;
else
rw->non_tail = 0;
rw->lazy_string = lazy_string;
if (lazy_string)
rw->port = scheme_true; /* hack to make NEED_INPUT() work */
for (i = prog->nsubexp; i--; ) {
startp[i] = rw->input_min - 1;
@ -2855,6 +2864,10 @@ regtry(regexp *prog, char *string, int stringpos, int stringlen,
endp[0] = rw->input;
return 1;
} else if (unanchored) {
if (lazy_string) {
NEED_INPUT(rw, stringpos, 1);
stringlen = rw->input_end - stringpos;
}
if (!stringlen)
return 0;
stringpos++;
@ -2863,9 +2876,16 @@ regtry(regexp *prog, char *string, int stringpos, int stringlen,
unsigned char *rs = prog->regstart;
int c;
while (1) {
if (lazy_string) {
NEED_INPUT(rw, stringpos, 1);
stringlen = rw->input_end - stringpos;
string = rw->instr;
}
if (!stringlen)
return 0;
c = UCHAR(string[stringpos]);
c = UCHAR(string[stringpos]);
if (rs[c >> 3] & (1 << (c & 0x7)))
break;
stringpos++;
@ -2883,7 +2903,43 @@ regtry(regexp *prog, char *string, int stringpos, int stringlen,
}
}
#define NEED_INPUT(rw, v, n) if (rw->port && (((v) + (n)) > rw->input_end)) read_more_from_regport(rw, (v) + (n))
#define LAZY_STRING_CHUNK_SIZE 1024
/*
 - read_more_from_lazy_string - extend the encoded prefix of a lazy string
 *
 * rw:         matcher state; rw->lazy_string must be non-NULL.
 * need_total: position the caller wants to be available in the
 *             encoded buffer.
 *
 * While characters in [start, end) remain unencoded, encodes the next
 * batch of them as UTF-8 into a fresh buffer that begins with a copy
 * of the bytes produced so far, then publishes the buffer through
 * rw->instr and its length through rw->input_end. Once every character
 * has been encoded, clears rw->port instead so that NEED_INPUT() stops
 * asking for more.
 *
 * NOTE(review): need_total is compared against ls->done, which counts
 * characters, while callers appear to pass byte positions. Since each
 * character encodes to at least one byte this can only over-read --
 * confirm that this is the intent.
 */
static void read_more_from_lazy_string(Regwork *rw, rxpos need_total)
{
  rx_lazy_str_t *ls = rw->lazy_string;

  if (ls->start + ls->done < ls->end) {
    intptr_t amt = ls->done, blen, tlen;
    char *s;

    /* Batch size: one chunk on the first call, afterwards twice the
       amount already encoded, so the buffer grows geometrically. */
    amt = amt ? (2 * amt) : LAZY_STRING_CHUNK_SIZE;
    /* Take at least enough to reach the requested position ... */
    if (ls->done + amt < need_total)
      amt = need_total - ls->done;
    /* ... but never run past the end of the requested range. */
    if (ls->start + ls->done + amt > ls->end)
      amt = ls->end - ls->start - ls->done;

    /* First pass with a NULL destination; its result is used as the
       number of bytes the batch needs. */
    blen = scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
                              NULL, 0,
                              0 /* not UTF-16 */);
    tlen = blen + ls->blen;
    s = (char *)scheme_malloc_atomic(tlen);

    /* Carry over the previously encoded bytes. On the first call
       ls->s is NULL and ls->blen is 0; passing a null pointer to
       memcpy() is undefined even for a zero count, so skip the copy. */
    if (ls->blen)
      memcpy(s, ls->s, ls->blen);

    /* Second pass: encode the batch right after the carried-over bytes. */
    scheme_utf8_encode(ls->chars, ls->start + ls->done, ls->start + ls->done + amt,
                       (unsigned char *)s, ls->blen,
                       0 /* not UTF-16 */);

    ls->blen = tlen;
    ls->s = s;
    ls->done += amt;

    rw->instr = s;
    rw->input_end = tlen;
  } else {
    /* turn off further port reading */
    rw->port = NULL;
  }
}
static void read_more_from_regport(Regwork *rw, rxpos need_total)
/* Called when we're about to look past our read-ahead */
@ -2891,6 +2947,10 @@ static void read_more_from_regport(Regwork *rw, rxpos need_total)
intptr_t got;
Scheme_Object *peekskip;
if (rw->lazy_string) {
return read_more_from_lazy_string(rw, need_total);
}
/* limit reading by rw->input_maxend: */
if (need_total > rw->input_maxend) {
need_total = rw->input_maxend;
@ -3007,7 +3067,7 @@ regtry_port(regexp *prog, Scheme_Object *port, Scheme_Object *unless_evt, int no
rw.input_maxend = BIGGEST_RXPOS;
rw.peekskip = peekskip;
m = regtry(prog, *work_string, skip, (*len) - skip,
m = regtry(prog, *work_string, skip, (*len) - skip, NULL,
startp, maybep, endp, match_stack, counters,
&rw, origin, prefix, prefix_len, prefix_offset, 0);
@ -3359,10 +3419,10 @@ regmatch(Regwork *rw, rxpos prog)
rw->input = is;
if (nongreedy && rw->port) {
/* Get at least one, but then don't
let regrepeat pull in arbitrary code: */
/* Get at least `min' bytes, but then don't
let regrepeat pull in arbitrary bytes: */
Scheme_Object *saveport;
NEED_INPUT(rw, save, 1);
NEED_INPUT(rw, save, min ? min : 1);
saveport = rw->port;
rw->port = NULL;
no = regrepeat(rw, body, maxc);
@ -5110,6 +5170,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
Scheme_Object *last_bytes_str = scheme_false, *srcin;
rx_lazy_str_t *lazy_string = NULL;
if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
&& !SCHEME_BYTE_STRINGP(argv[0])
@ -5243,17 +5304,36 @@ static Scheme_Object *gen_compare(char *name, int pos,
full_s = SCHEME_BYTE_STR_VAL(srcin);
else {
/* Extract substring and UTF-8 encode: */
int blen;
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
NULL, 0,
0 /* not UTF-16 */);
full_s = (char *)scheme_malloc_atomic(blen);
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
(unsigned char *)full_s, 0,
0 /* not UTF-16 */);
orig_offset = offset;
offset = 0;
endset = blen;
if (endset - offset < LAZY_STRING_CHUNK_SIZE) {
/* String is short enough to encode in one go: */
int blen;
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
NULL, 0,
0 /* not UTF-16 */);
full_s = (char *)scheme_malloc_atomic(blen);
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
(unsigned char *)full_s, 0,
0 /* not UTF-16 */);
orig_offset = offset;
offset = 0;
endset = blen;
} else {
/* Handle long strings (at least LAZY_STRING_CHUNK_SIZE characters) by encoding lazily: */
lazy_string = MALLOC_ONE_RT(rx_lazy_str_t);
#ifdef MZTAG_REQUIRED
lazy_string->type = scheme_rt_rx_lazy_string;
#endif
lazy_string->start = offset;
lazy_string->end = endset;
lazy_string->done = 0;
lazy_string->blen = 0;
lazy_string->s = NULL;
lazy_string->chars = SCHEME_CHAR_STR_VAL(srcin);
full_s = NULL;
orig_offset = offset;
offset = 0;
endset = 0;
}
if (r->flags & REGEXP_IS_UTF8)
was_non_byte = 1;
else {
@ -5291,12 +5371,16 @@ static Scheme_Object *gen_compare(char *name, int pos,
dropped = scheme_make_integer(0);
m = regexec(name, r, full_s, offset, endset - offset, offset, startp, maybep, endp, match_stack,
m = regexec(name, r, full_s, offset, endset - offset, offset, lazy_string,
startp, maybep, endp, match_stack,
iport, unless_evt, nonblock,
&full_s, peek, pos, last_bytes_count, oport,
startv, endv, &dropped,
prefix, prefix_len, prefix_offset);
if (lazy_string)
full_s = lazy_string->s;
if (iport) {
minpos = -prefix_len;
offset = 0;
@ -5624,7 +5708,8 @@ static Scheme_Object *gen_replace(const char *name, int argc, Scheme_Object *arg
int m;
do {
m = regexec(name, r, source, srcoffset, sourcelen - srcoffset, 0, startp, maybep, endp, NULL,
m = regexec(name, r, source, srcoffset, sourcelen - srcoffset, 0, NULL,
startp, maybep, endp, NULL,
NULL, NULL, 0,
NULL, 0, 0, 0, NULL, NULL, NULL, NULL,
prefix, prefix_len, prefix_offset);
@ -5874,6 +5959,7 @@ void scheme_regexp_initialize(Scheme_Env *env)
#ifdef MZ_PRECISE_GC
GC_REG_TRAV(scheme_regexp_type, mark_regexp);
GC_REG_TRAV(scheme_rt_regwork, mark_regwork);
GC_REG_TRAV(scheme_rt_rx_lazy_string, mark_lazy_string);
#endif
REGISTER_SO(empty_byte_string);

View File

@ -230,7 +230,15 @@ typedef struct Regwork {
Scheme_Object *peekskip;
char *prefix;
rxpos prefix_len, prefix_delta;
struct rx_lazy_str_t *lazy_string;
int non_tail, rewind_stack_size, rewind_stack_count, rewind_stack_prompt;
rxpos *rewind_stack;
} Regwork;
/* State for matching against a character string whose UTF-8 encoding
   is produced incrementally rather than all at once. */
typedef struct rx_lazy_str_t {
/* NOTE(review): presumably expands to a type-tag field when
   MZTAG_REQUIRED is defined -- confirm against its definition. */
MZTAG_IF_REQUIRED
/* start, end: bounds of the range within `chars` that is to be encoded.
   done:       number of characters, counted from `start`, encoded so far.
   blen:       number of bytes currently held in `s`. */
intptr_t start, done, end, blen;
/* Source characters; indexed with `start`, `done` and `end`. */
mzchar *chars;
/* UTF-8 bytes encoded so far; NULL until the first batch is encoded. */
char *s;
} rx_lazy_str_t;

View File

@ -192,83 +192,83 @@ enum {
scheme_once_used_type, /* 172 */
scheme_serialized_symbol_type, /* 173 */
scheme_serialized_structure_type, /* 174 */
scheme_fsemaphore_type, /* 175 */
#ifdef MZTAG_REQUIRED
_scheme_last_normal_type_, /* 175 */
_scheme_last_normal_type_, /* 176 */
scheme_rt_weak_array, /* 176 */
scheme_rt_weak_array, /* 177 */
scheme_rt_comp_env, /* 177*/
scheme_rt_constant_binding, /* 178 */
scheme_rt_resolve_info, /* 179 */
scheme_rt_optimize_info, /* 180 */
scheme_rt_compile_info, /* 181 */
scheme_rt_cont_mark, /* 182 */
scheme_rt_saved_stack, /* 183 */
scheme_rt_reply_item, /* 184 */
scheme_rt_closure_info, /* 185 */
scheme_rt_overflow, /* 186 */
scheme_rt_overflow_jmp, /* 187 */
scheme_rt_meta_cont, /* 188 */
scheme_rt_dyn_wind_cell, /* 189 */
scheme_rt_dyn_wind_info, /* 190 */
scheme_rt_dyn_wind, /* 191 */
scheme_rt_dup_check, /* 192 */
scheme_rt_thread_memory, /* 193 */
scheme_rt_input_file, /* 194 */
scheme_rt_input_fd, /* 195 */
scheme_rt_oskit_console_input, /* 196 */
scheme_rt_tested_input_file, /* 197 */
scheme_rt_tested_output_file, /* 198 */
scheme_rt_indexed_string, /* 199 */
scheme_rt_output_file, /* 100 */
scheme_rt_load_handler_data, /* 201 */
scheme_rt_pipe, /* 202 */
scheme_rt_beos_process, /* 203 */
scheme_rt_system_child, /* 204 */
scheme_rt_tcp, /* 205 */
scheme_rt_write_data, /* 206 */
scheme_rt_tcp_select_info, /* 207 */
scheme_rt_param_data, /* 208 */
scheme_rt_will, /* 209 */
scheme_rt_struct_proc_info, /* 210 */
scheme_rt_linker_name, /* 211 */
scheme_rt_param_map, /* 212 */
scheme_rt_finalization, /* 213 */
scheme_rt_finalizations, /* 214 */
scheme_rt_cpp_object, /* 215 */
scheme_rt_cpp_array_object, /* 216 */
scheme_rt_stack_object, /* 217 */
scheme_rt_preallocated_object, /* 218 */
scheme_thread_hop_type, /* 219 */
scheme_rt_srcloc, /* 220 */
scheme_rt_evt, /* 221 */
scheme_rt_syncing, /* 222 */
scheme_rt_comp_prefix, /* 223 */
scheme_rt_user_input, /* 224 */
scheme_rt_user_output, /* 225 */
scheme_rt_compact_port, /* 226 */
scheme_rt_read_special_dw, /* 227 */
scheme_rt_regwork, /* 228 */
scheme_rt_buf_holder, /* 229 */
scheme_rt_parameterization, /* 230 */
scheme_rt_print_params, /* 231 */
scheme_rt_read_params, /* 232 */
scheme_rt_native_code, /* 233 */
scheme_rt_native_code_plus_case, /* 234 */
scheme_rt_jitter_data, /* 235 */
scheme_rt_module_exports, /* 236 */
scheme_rt_delay_load_info, /* 237 */
scheme_rt_marshal_info, /* 238 */
scheme_rt_unmarshal_info, /* 239 */
scheme_rt_runstack, /* 240 */
scheme_rt_sfs_info, /* 241 */
scheme_rt_validate_clearing, /* 242 */
scheme_rt_rb_node, /* 243 */
scheme_rt_lightweight_cont, /* 244 */
scheme_rt_constant_binding, /* 179 */
scheme_rt_resolve_info, /* 180 */
scheme_rt_optimize_info, /* 181 */
scheme_rt_compile_info, /* 182 */
scheme_rt_cont_mark, /* 183 */
scheme_rt_saved_stack, /* 184 */
scheme_rt_reply_item, /* 185 */
scheme_rt_closure_info, /* 186 */
scheme_rt_overflow, /* 187 */
scheme_rt_overflow_jmp, /* 188 */
scheme_rt_meta_cont, /* 189 */
scheme_rt_dyn_wind_cell, /* 190 */
scheme_rt_dyn_wind_info, /* 191 */
scheme_rt_dyn_wind, /* 192 */
scheme_rt_dup_check, /* 193 */
scheme_rt_thread_memory, /* 194 */
scheme_rt_input_file, /* 195 */
scheme_rt_input_fd, /* 196 */
scheme_rt_oskit_console_input, /* 197 */
scheme_rt_tested_input_file, /* 198 */
scheme_rt_tested_output_file, /* 199 */
scheme_rt_indexed_string, /* 200 */
scheme_rt_output_file, /* 201 */
scheme_rt_load_handler_data, /* 202 */
scheme_rt_pipe, /* 203 */
scheme_rt_beos_process, /* 204 */
scheme_rt_system_child, /* 205 */
scheme_rt_tcp, /* 206 */
scheme_rt_write_data, /* 207 */
scheme_rt_tcp_select_info, /* 208 */
scheme_rt_param_data, /* 209 */
scheme_rt_will, /* 210 */
scheme_rt_struct_proc_info, /* 211 */
scheme_rt_linker_name, /* 212 */
scheme_rt_param_map, /* 213 */
scheme_rt_finalization, /* 214 */
scheme_rt_finalizations, /* 215 */
scheme_rt_cpp_object, /* 216 */
scheme_rt_cpp_array_object, /* 217 */
scheme_rt_stack_object, /* 218 */
scheme_rt_preallocated_object, /* 219 */
scheme_thread_hop_type, /* 220 */
scheme_rt_srcloc, /* 221 */
scheme_rt_evt, /* 222 */
scheme_rt_syncing, /* 223 */
scheme_rt_comp_prefix, /* 224 */
scheme_rt_user_input, /* 225 */
scheme_rt_user_output, /* 226 */
scheme_rt_compact_port, /* 227 */
scheme_rt_read_special_dw, /* 228 */
scheme_rt_regwork, /* 229 */
scheme_rt_rx_lazy_string, /* 230 */
scheme_rt_buf_holder, /* 231 */
scheme_rt_parameterization, /* 232 */
scheme_rt_print_params, /* 233 */
scheme_rt_read_params, /* 234 */
scheme_rt_native_code, /* 235 */
scheme_rt_native_code_plus_case, /* 236 */
scheme_rt_jitter_data, /* 237 */
scheme_rt_module_exports, /* 238 */
scheme_rt_delay_load_info, /* 239 */
scheme_rt_marshal_info, /* 240 */
scheme_rt_unmarshal_info, /* 241 */
scheme_rt_runstack, /* 242 */
scheme_rt_sfs_info, /* 243 */
scheme_rt_validate_clearing, /* 244 */
scheme_rt_rb_node, /* 245 */
scheme_rt_lightweight_cont, /* 246 */
#endif
scheme_fsemaphore_type, /* 245 */
_scheme_last_type_
};