allow paths as input to regexp-matching functions

This commit is contained in:
Matthew Flatt 2010-05-11 06:03:39 -06:00
parent 745adb03d1
commit acb98cef60
3 changed files with 66 additions and 34 deletions

View File

@ -126,7 +126,12 @@
success-choose failure-k success-choose failure-k
port-success-k port-success-choose port-failure-k port-success-k port-success-choose port-failure-k
need-leftover? peek?) need-leftover? peek?)
(let* ([len (cond [(string? string) (string-length string)] (let* ([string (if (path? string)
(if (or (string? pattern) (regexp? pattern))
(path->string string)
(path->bytes string))
string)]
[len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)] [(bytes? string) (bytes-length string)]
[else #f])] [else #f])]
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)] [orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
@ -143,7 +148,7 @@
(raise-type-error 'name "input port" string)) (raise-type-error 'name "input port" string))
(unless (or len (input-port? string)) (unless (or len (input-port? string))
(raise-type-error (raise-type-error
'name "string, byte string or input port" string))) 'name "string, byte string, path, or input port" string)))
(unless (and (number? start) (exact? start) (integer? start) (unless (and (number? start) (exact? start) (integer? start)
(start . >= . 0)) (start . >= . 0))
(raise-type-error 'name "non-negative exact integer" start)) (raise-type-error 'name "non-negative exact integer" start))
@ -283,7 +288,11 @@
(bytes? pattern))) (bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?)) (string->bytes/utf-8 string (char->integer #\?))
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (or (bytes? buf) (and (path? string)
(or (bytes? pattern)
(byte-regexp? pattern))))
subbytes
substring))
(regexp-loop regexp-split loop start end pattern buf ipre (regexp-loop regexp-split loop start end pattern buf ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc)) (lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc))
@ -418,7 +427,11 @@
(bytes? pattern))) (bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?)) (string->bytes/utf-8 string (char->integer #\?))
string)) string))
(define sub (if (bytes? buf) subbytes substring)) (define sub (if (or (bytes? buf) (and (path? string)
(or (bytes? pattern)
(byte-regexp? pattern))))
subbytes
substring))
(regexp-loop regexp-match* loop start end pattern buf ipre (regexp-loop regexp-match* loop start end pattern buf ipre
;; success-choose: ;; success-choose:
(lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc)) (lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc))

View File

@ -210,20 +210,26 @@ the start of the input or of a line.}
@section{Regexp Matching} @section{Regexp Matching}
@defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f] [output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""]) [input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (or (string? input) (path? input)))
(or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))]{ (or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))]{
Attempts to match @racket[pattern] (a string, byte string, @tech{regexp Attempts to match @racket[pattern] (a string, byte string,
value}, or byte-@tech{regexp value}) once to a portion of @racket[input]. The @tech{regexp value}, or byte-@tech{regexp value}) once to a portion of
matcher finds a portion of @racket[input] that matches and is closest @racket[input]. The matcher finds a portion of @racket[input] that
to the start of the input (after @racket[start-pos]). matches and is closest to the start of the input (after
@racket[start-pos]).
If @racket[input] is a path, it is converted to a byte string with
@racket[path->bytes] if @racket[pattern] is a byte string or a
byte-based regexp. Otherwise, @racket[input] is converted to a string
with @racket[path->string].
The optional @racket[start-pos] and @racket[end-pos] arguments select The optional @racket[start-pos] and @racket[end-pos] arguments select
a portion of @racket[input] for matching; the default is the entire a portion of @racket[input] for matching; the default is the entire
@ -320,12 +326,12 @@ bytes. To avoid such interleaving, use @racket[regexp-match-peek]
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""]) [input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (or (string? input) (path? input)))
(listof string?) (listof string?)
(listof bytes?))]{ (listof bytes?))]{
@ -381,7 +387,7 @@ fails.}
@defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f] [output-port (or/c output-port? #f) #f]
@ -414,7 +420,7 @@ positions indicate the number of bytes that were read, including
]} ]}
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""]) [input-prefix bytes? #""])
@ -430,7 +436,7 @@ like @racket[regexp-match*].
@defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f] [output-port (or/c output-port? #f) #f]
@ -447,7 +453,7 @@ match succeeds, @racket[#f] otherwise.
@defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]) [input (or/c string? bytes? path? input-port?)])
boolean?]{ boolean?]{
Like @racket[regexp-match?], but @racket[#t] is only returned when the Like @racket[regexp-match?], but @racket[#t] is only returned when the
@ -553,7 +559,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
@racket[regexp-match*].} @racket[regexp-match*].}
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f] [output-port (or/c output-port? #f) #f]
@ -561,7 +567,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
[count exact-nonnegative-integer? 1]) [count exact-nonnegative-integer? 1])
(values (values
(if (and (or (string? pattern) (regexp? pattern)) (if (and (or (string? pattern) (regexp? pattern))
(string? input)) (or (string? input) (path? input)))
(or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f))))) (or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
(or/c #f bytes?))]{ (or/c #f bytes?))]{
@ -578,7 +584,7 @@ to determine an appropriate value for @racket[count].}
@deftogether[( @deftogether[(
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)] @defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)] [input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0] [start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f] [end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""] [input-prefix bytes? #""]

View File

@ -5000,25 +5000,38 @@ static Scheme_Object *gen_compare(char *name, int pos,
rxpos *startp, *maybep, *endp, prefix_len = 0, prefix_offset = 0, minpos; rxpos *startp, *maybep, *endp, prefix_len = 0, prefix_offset = 0, minpos;
int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes; int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL; Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
Scheme_Object *last_bytes_str = scheme_false; Scheme_Object *last_bytes_str = scheme_false, *srcin;
if (SCHEME_TYPE(argv[0]) != scheme_regexp_type if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
&& !SCHEME_BYTE_STRINGP(argv[0]) && !SCHEME_BYTE_STRINGP(argv[0])
&& !SCHEME_CHAR_STRINGP(argv[0])) && !SCHEME_CHAR_STRINGP(argv[0]))
scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv); scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv);
if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1]))) if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
&& !SCHEME_INPUT_PORTP(argv[1])) && !SCHEME_INPUT_PORTP(argv[1])
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, or input port", 1, argc, argv); && !SCHEME_PATHP(argv[1]))
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, path, or input port", 1, argc, argv);
if (SCHEME_CHAR_STRINGP(argv[1])) { srcin = argv[1];
if (SCHEME_PATHP(srcin)) {
if (SCHEME_BYTE_STRINGP(argv[0])
|| (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type)
&& !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)))
srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin),
SCHEME_PATH_LEN(srcin),
1);
else
srcin = scheme_path_to_char_string(srcin);
}
if (SCHEME_CHAR_STRINGP(srcin)) {
iport = NULL; iport = NULL;
endset = SCHEME_CHAR_STRLEN_VAL(argv[1]); endset = SCHEME_CHAR_STRLEN_VAL(srcin);
} else if (SCHEME_INPUT_PORTP(argv[1])) { } else if (SCHEME_INPUT_PORTP(srcin)) {
iport = argv[1]; iport = srcin;
endset = -2; endset = -2;
} else { } else {
iport = NULL; iport = NULL;
endset = SCHEME_BYTE_STRLEN_VAL(argv[1]); endset = SCHEME_BYTE_STRLEN_VAL(srcin);
} }
if (argc > 2) { if (argc > 2) {
@ -5027,7 +5040,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0); offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
if (!iport && (offset > len)) { if (!iport && (offset > len)) {
scheme_out_of_string_range(name, "offset ", argv[2], argv[1], 0, len); scheme_out_of_string_range(name, "offset ", argv[2], srcin, 0, len);
return NULL; return NULL;
} else if (offset < 0) { } else if (offset < 0) {
/* argument was a bignum */ /* argument was a bignum */
@ -5052,7 +5065,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
return NULL; return NULL;
} }
} else if (endset < offset || endset > len) { } else if (endset < offset || endset > len) {
scheme_out_of_string_range(name, "ending ", argv[3], argv[1], offset, len); scheme_out_of_string_range(name, "ending ", argv[3], srcin, offset, len);
return NULL; return NULL;
} }
endv = argv[3]; endv = argv[3];
@ -5117,16 +5130,16 @@ static Scheme_Object *gen_compare(char *name, int pos,
was_non_byte = 0; was_non_byte = 0;
orig_offset = 0; /* extra offset */ orig_offset = 0; /* extra offset */
if (!iport) { if (!iport) {
if (SCHEME_BYTE_STRINGP(argv[1])) if (SCHEME_BYTE_STRINGP(srcin))
full_s = SCHEME_BYTE_STR_VAL(argv[1]); full_s = SCHEME_BYTE_STR_VAL(srcin);
else { else {
/* Extract substring and UTF-8 encode: */ /* Extract substring and UTF-8 encode: */
int blen; int blen;
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset, blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
NULL, 0, NULL, 0,
0 /* not UTF-16 */); 0 /* not UTF-16 */);
full_s = (char *)scheme_malloc_atomic(blen); full_s = (char *)scheme_malloc_atomic(blen);
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset, scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
(unsigned char *)full_s, 0, (unsigned char *)full_s, 0,
0 /* not UTF-16 */); 0 /* not UTF-16 */);
orig_offset = offset; orig_offset = offset;
@ -5136,7 +5149,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
was_non_byte = 1; was_non_byte = 1;
else { else {
/* Convert orig_offset into encoded bytes */ /* Convert orig_offset into encoded bytes */
orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), 0, orig_offset, orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset,
NULL, 0, NULL, 0,
0); 0);
} }