allow paths as input to regexp-matching functions

This commit is contained in:
Matthew Flatt 2010-05-11 06:03:39 -06:00
parent 745adb03d1
commit acb98cef60
3 changed files with 66 additions and 34 deletions

View File

@ -126,7 +126,12 @@
success-choose failure-k
port-success-k port-success-choose port-failure-k
need-leftover? peek?)
(let* ([len (cond [(string? string) (string-length string)]
(let* ([string (if (path? string)
(if (or (string? pattern) (regexp? pattern))
(path->string string)
(path->bytes string))
string)]
[len (cond [(string? string) (string-length string)]
[(bytes? string) (bytes-length string)]
[else #f])]
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
@ -143,7 +148,7 @@
(raise-type-error 'name "input port" string))
(unless (or len (input-port? string))
(raise-type-error
'name "string, byte string or input port" string)))
'name "string, byte string, path, or input port" string)))
(unless (and (number? start) (exact? start) (integer? start)
(start . >= . 0))
(raise-type-error 'name "non-negative exact integer" start))
@ -283,7 +288,11 @@
(bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?))
string))
(define sub (if (bytes? buf) subbytes substring))
(define sub (if (or (bytes? buf) (and (path? string)
(or (bytes? pattern)
(byte-regexp? pattern))))
subbytes
substring))
(regexp-loop regexp-split loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc))
@ -418,7 +427,11 @@
(bytes? pattern)))
(string->bytes/utf-8 string (char->integer #\?))
string))
(define sub (if (bytes? buf) subbytes substring))
(define sub (if (or (bytes? buf) (and (path? string)
(or (bytes? pattern)
(byte-regexp? pattern))))
subbytes
substring))
(regexp-loop regexp-match* loop start end pattern buf ipre
;; success-choose:
(lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc))

View File

@ -210,20 +210,26 @@ the start of the input or of a line.}
@section{Regexp Matching}
@defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or (string? input) (path? input)))
(or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))]{
Attempts to match @racket[pattern] (a string, byte string, @tech{regexp
value}, or byte-@tech{regexp value}) once to a portion of @racket[input]. The
matcher finds a portion of @racket[input] that matches and is closest
to the start of the input (after @racket[start-pos]).
Attempts to match @racket[pattern] (a string, byte string,
@tech{regexp value}, or byte-@tech{regexp value}) once to a portion of
@racket[input]. The matcher finds a portion of @racket[input] that
matches and is closest to the start of the input (after
@racket[start-pos]).
If @racket[input] is a path, it is converted to a byte string with
@racket[path->bytes] if @racket[pattern] is a byte string or a
byte-based regexp. Otherwise, @racket[input] is converted to a string
with @racket[path->string].
The optional @racket[start-pos] and @racket[end-pos] arguments select
a portion of @racket[input] for matching; the default is the entire
@ -320,12 +326,12 @@ bytes. To avoid such interleaving, use @racket[regexp-match-peek]
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or (string? input) (path? input)))
(listof string?)
(listof bytes?))]{
@ -381,7 +387,7 @@ fails.}
@defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
@ -414,7 +420,7 @@ positions indicate the number of bytes that were read, including
]}
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""])
@ -430,7 +436,7 @@ like @racket[regexp-match*].
@defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
@ -447,7 +453,7 @@ match succeeds, @racket[#f] otherwise.
@defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)])
[input (or/c string? bytes? path? input-port?)])
boolean?]{
Like @racket[regexp-match?], but @racket[#t] is only returned when the
@ -553,7 +559,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
@racket[regexp-match*].}
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[output-port (or/c output-port? #f) #f]
@ -561,7 +567,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
[count exact-nonnegative-integer? 1])
(values
(if (and (or (string? pattern) (regexp? pattern))
(string? input))
(or (string? input) (path? input)))
(or/c #f (cons/c string? (listof (or/c string? #f))))
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
(or/c #f bytes?))]{
@ -578,7 +584,7 @@ to determine an appropriate value for @racket[count].}
@deftogether[(
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
[input (or/c string? bytes? input-port?)]
[input (or/c string? bytes? path? input-port?)]
[start-pos exact-nonnegative-integer? 0]
[end-pos (or/c exact-nonnegative-integer? #f) #f]
[input-prefix bytes? #""]

View File

@ -5000,25 +5000,38 @@ static Scheme_Object *gen_compare(char *name, int pos,
rxpos *startp, *maybep, *endp, prefix_len = 0, prefix_offset = 0, minpos;
int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
Scheme_Object *last_bytes_str = scheme_false;
Scheme_Object *last_bytes_str = scheme_false, *srcin;
if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
&& !SCHEME_BYTE_STRINGP(argv[0])
&& !SCHEME_CHAR_STRINGP(argv[0]))
scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv);
if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
&& !SCHEME_INPUT_PORTP(argv[1]))
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, or input port", 1, argc, argv);
&& !SCHEME_INPUT_PORTP(argv[1])
&& !SCHEME_PATHP(argv[1]))
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, path, or input port", 1, argc, argv);
if (SCHEME_CHAR_STRINGP(argv[1])) {
srcin = argv[1];
if (SCHEME_PATHP(srcin)) {
if (SCHEME_BYTE_STRINGP(argv[0])
|| (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type)
&& !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)))
srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin),
SCHEME_PATH_LEN(srcin),
1);
else
srcin = scheme_path_to_char_string(srcin);
}
if (SCHEME_CHAR_STRINGP(srcin)) {
iport = NULL;
endset = SCHEME_CHAR_STRLEN_VAL(argv[1]);
} else if (SCHEME_INPUT_PORTP(argv[1])) {
iport = argv[1];
endset = SCHEME_CHAR_STRLEN_VAL(srcin);
} else if (SCHEME_INPUT_PORTP(srcin)) {
iport = srcin;
endset = -2;
} else {
iport = NULL;
endset = SCHEME_BYTE_STRLEN_VAL(argv[1]);
endset = SCHEME_BYTE_STRLEN_VAL(srcin);
}
if (argc > 2) {
@ -5027,7 +5040,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
if (!iport && (offset > len)) {
scheme_out_of_string_range(name, "offset ", argv[2], argv[1], 0, len);
scheme_out_of_string_range(name, "offset ", argv[2], srcin, 0, len);
return NULL;
} else if (offset < 0) {
/* argument was a bignum */
@ -5052,7 +5065,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
return NULL;
}
} else if (endset < offset || endset > len) {
scheme_out_of_string_range(name, "ending ", argv[3], argv[1], offset, len);
scheme_out_of_string_range(name, "ending ", argv[3], srcin, offset, len);
return NULL;
}
endv = argv[3];
@ -5117,16 +5130,16 @@ static Scheme_Object *gen_compare(char *name, int pos,
was_non_byte = 0;
orig_offset = 0; /* extra offset */
if (!iport) {
if (SCHEME_BYTE_STRINGP(argv[1]))
full_s = SCHEME_BYTE_STR_VAL(argv[1]);
if (SCHEME_BYTE_STRINGP(srcin))
full_s = SCHEME_BYTE_STR_VAL(srcin);
else {
/* Extract substring and UTF-8 encode: */
int blen;
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
NULL, 0,
0 /* not UTF-16 */);
full_s = (char *)scheme_malloc_atomic(blen);
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
(unsigned char *)full_s, 0,
0 /* not UTF-16 */);
orig_offset = offset;
@ -5136,7 +5149,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
was_non_byte = 1;
else {
/* Convert orig_offset into encoded bytes */
orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), 0, orig_offset,
orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset,
NULL, 0,
0);
}