allow paths as input to regexp-matching functions
This commit is contained in:
parent
745adb03d1
commit
acb98cef60
|
@ -126,7 +126,12 @@
|
|||
success-choose failure-k
|
||||
port-success-k port-success-choose port-failure-k
|
||||
need-leftover? peek?)
|
||||
(let* ([len (cond [(string? string) (string-length string)]
|
||||
(let* ([string (if (path? string)
|
||||
(if (or (string? pattern) (regexp? pattern))
|
||||
(path->string string)
|
||||
(path->bytes string))
|
||||
string)]
|
||||
[len (cond [(string? string) (string-length string)]
|
||||
[(bytes? string) (bytes-length string)]
|
||||
[else #f])]
|
||||
[orig-rx (cond [(bytes? pattern) (byte-regexp pattern)]
|
||||
|
@ -143,7 +148,7 @@
|
|||
(raise-type-error 'name "input port" string))
|
||||
(unless (or len (input-port? string))
|
||||
(raise-type-error
|
||||
'name "string, byte string or input port" string)))
|
||||
'name "string, byte string, path, or input port" string)))
|
||||
(unless (and (number? start) (exact? start) (integer? start)
|
||||
(start . >= . 0))
|
||||
(raise-type-error 'name "non-negative exact integer" start))
|
||||
|
@ -283,7 +288,11 @@
|
|||
(bytes? pattern)))
|
||||
(string->bytes/utf-8 string (char->integer #\?))
|
||||
string))
|
||||
(define sub (if (bytes? buf) subbytes substring))
|
||||
(define sub (if (or (bytes? buf) (and (path? string)
|
||||
(or (bytes? pattern)
|
||||
(byte-regexp? pattern))))
|
||||
subbytes
|
||||
substring))
|
||||
(regexp-loop regexp-split loop start end pattern buf ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc))
|
||||
|
@ -418,7 +427,11 @@
|
|||
(bytes? pattern)))
|
||||
(string->bytes/utf-8 string (char->integer #\?))
|
||||
string))
|
||||
(define sub (if (bytes? buf) subbytes substring))
|
||||
(define sub (if (or (bytes? buf) (and (path? string)
|
||||
(or (bytes? pattern)
|
||||
(byte-regexp? pattern))))
|
||||
subbytes
|
||||
substring))
|
||||
(regexp-loop regexp-match* loop start end pattern buf ipre
|
||||
;; success-choose:
|
||||
(lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc))
|
||||
|
|
|
@ -210,20 +210,26 @@ the start of the input or of a line.}
|
|||
@section{Regexp Matching}
|
||||
|
||||
@defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or (string? input) (path? input)))
|
||||
(or/c #f (cons/c string? (listof (or/c string? #f))))
|
||||
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))]{
|
||||
|
||||
Attempts to match @racket[pattern] (a string, byte string, @tech{regexp
|
||||
value}, or byte-@tech{regexp value}) once to a portion of @racket[input]. The
|
||||
matcher finds a portion of @racket[input] that matches and is closest
|
||||
to the start of the input (after @racket[start-pos]).
|
||||
Attempts to match @racket[pattern] (a string, byte string,
|
||||
@tech{regexp value}, or byte-@tech{regexp value}) once to a portion of
|
||||
@racket[input]. The matcher finds a portion of @racket[input] that
|
||||
matches and is closest to the start of the input (after
|
||||
@racket[start-pos]).
|
||||
|
||||
If @racket[input] is a path, it is converted to a byte string with
|
||||
@racket[path->bytes] if @racket[pattern] is a byte string or a
|
||||
byte-based regexp. Otherwise, @racket[input] is converted to a string
|
||||
with @racket[path->string].
|
||||
|
||||
The optional @racket[start-pos] and @racket[end-pos] arguments select
|
||||
a portion of @racket[input] for matching; the default is the entire
|
||||
|
@ -320,12 +326,12 @@ bytes. To avoid such interleaving, use @racket[regexp-match-peek]
|
|||
|
||||
|
||||
@defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or (string? input) (path? input)))
|
||||
(listof string?)
|
||||
(listof bytes?))]{
|
||||
|
||||
|
@ -381,7 +387,7 @@ fails.}
|
|||
|
||||
|
||||
@defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
|
@ -414,7 +420,7 @@ positions indicate the number of bytes that were read, including
|
|||
]}
|
||||
|
||||
@defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""])
|
||||
|
@ -430,7 +436,7 @@ like @racket[regexp-match*].
|
|||
|
||||
|
||||
@defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
|
@ -447,7 +453,7 @@ match succeeds, @racket[#f] otherwise.
|
|||
|
||||
|
||||
@defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)])
|
||||
[input (or/c string? bytes? path? input-port?)])
|
||||
boolean?]{
|
||||
|
||||
Like @racket[regexp-match?], but @racket[#t] is only returned when the
|
||||
|
@ -553,7 +559,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
|
|||
@racket[regexp-match*].}
|
||||
|
||||
@defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[output-port (or/c output-port? #f) #f]
|
||||
|
@ -561,7 +567,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like
|
|||
[count exact-nonnegative-integer? 1])
|
||||
(values
|
||||
(if (and (or (string? pattern) (regexp? pattern))
|
||||
(string? input))
|
||||
(or (string? input) (path? input)))
|
||||
(or/c #f (cons/c string? (listof (or/c string? #f))))
|
||||
(or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))
|
||||
(or/c #f bytes?))]{
|
||||
|
@ -578,7 +584,7 @@ to determine an appropriate value for @racket[count].}
|
|||
|
||||
@deftogether[(
|
||||
@defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)]
|
||||
[input (or/c string? bytes? input-port?)]
|
||||
[input (or/c string? bytes? path? input-port?)]
|
||||
[start-pos exact-nonnegative-integer? 0]
|
||||
[end-pos (or/c exact-nonnegative-integer? #f) #f]
|
||||
[input-prefix bytes? #""]
|
||||
|
|
|
@ -5000,25 +5000,38 @@ static Scheme_Object *gen_compare(char *name, int pos,
|
|||
rxpos *startp, *maybep, *endp, prefix_len = 0, prefix_offset = 0, minpos;
|
||||
int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes;
|
||||
Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL;
|
||||
Scheme_Object *last_bytes_str = scheme_false;
|
||||
Scheme_Object *last_bytes_str = scheme_false, *srcin;
|
||||
|
||||
if (SCHEME_TYPE(argv[0]) != scheme_regexp_type
|
||||
&& !SCHEME_BYTE_STRINGP(argv[0])
|
||||
&& !SCHEME_CHAR_STRINGP(argv[0]))
|
||||
scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv);
|
||||
if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1])))
|
||||
&& !SCHEME_INPUT_PORTP(argv[1]))
|
||||
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, or input port", 1, argc, argv);
|
||||
&& !SCHEME_INPUT_PORTP(argv[1])
|
||||
&& !SCHEME_PATHP(argv[1]))
|
||||
scheme_wrong_type(name, peek ? "input-port" : "string, byte string, path, or input port", 1, argc, argv);
|
||||
|
||||
if (SCHEME_CHAR_STRINGP(argv[1])) {
|
||||
srcin = argv[1];
|
||||
if (SCHEME_PATHP(srcin)) {
|
||||
if (SCHEME_BYTE_STRINGP(argv[0])
|
||||
|| (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type)
|
||||
&& !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8)))
|
||||
srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin),
|
||||
SCHEME_PATH_LEN(srcin),
|
||||
1);
|
||||
else
|
||||
srcin = scheme_path_to_char_string(srcin);
|
||||
}
|
||||
|
||||
if (SCHEME_CHAR_STRINGP(srcin)) {
|
||||
iport = NULL;
|
||||
endset = SCHEME_CHAR_STRLEN_VAL(argv[1]);
|
||||
} else if (SCHEME_INPUT_PORTP(argv[1])) {
|
||||
iport = argv[1];
|
||||
endset = SCHEME_CHAR_STRLEN_VAL(srcin);
|
||||
} else if (SCHEME_INPUT_PORTP(srcin)) {
|
||||
iport = srcin;
|
||||
endset = -2;
|
||||
} else {
|
||||
iport = NULL;
|
||||
endset = SCHEME_BYTE_STRLEN_VAL(argv[1]);
|
||||
endset = SCHEME_BYTE_STRLEN_VAL(srcin);
|
||||
}
|
||||
|
||||
if (argc > 2) {
|
||||
|
@ -5027,7 +5040,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
|
|||
offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0);
|
||||
|
||||
if (!iport && (offset > len)) {
|
||||
scheme_out_of_string_range(name, "offset ", argv[2], argv[1], 0, len);
|
||||
scheme_out_of_string_range(name, "offset ", argv[2], srcin, 0, len);
|
||||
return NULL;
|
||||
} else if (offset < 0) {
|
||||
/* argument was a bignum */
|
||||
|
@ -5052,7 +5065,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
|
|||
return NULL;
|
||||
}
|
||||
} else if (endset < offset || endset > len) {
|
||||
scheme_out_of_string_range(name, "ending ", argv[3], argv[1], offset, len);
|
||||
scheme_out_of_string_range(name, "ending ", argv[3], srcin, offset, len);
|
||||
return NULL;
|
||||
}
|
||||
endv = argv[3];
|
||||
|
@ -5117,16 +5130,16 @@ static Scheme_Object *gen_compare(char *name, int pos,
|
|||
was_non_byte = 0;
|
||||
orig_offset = 0; /* extra offset */
|
||||
if (!iport) {
|
||||
if (SCHEME_BYTE_STRINGP(argv[1]))
|
||||
full_s = SCHEME_BYTE_STR_VAL(argv[1]);
|
||||
if (SCHEME_BYTE_STRINGP(srcin))
|
||||
full_s = SCHEME_BYTE_STR_VAL(srcin);
|
||||
else {
|
||||
/* Extract substring and UTF-8 encode: */
|
||||
int blen;
|
||||
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
|
||||
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
|
||||
NULL, 0,
|
||||
0 /* not UTF-16 */);
|
||||
full_s = (char *)scheme_malloc_atomic(blen);
|
||||
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset,
|
||||
scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset,
|
||||
(unsigned char *)full_s, 0,
|
||||
0 /* not UTF-16 */);
|
||||
orig_offset = offset;
|
||||
|
@ -5136,7 +5149,7 @@ static Scheme_Object *gen_compare(char *name, int pos,
|
|||
was_non_byte = 1;
|
||||
else {
|
||||
/* Convert orig_offset into encoded bytes */
|
||||
orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), 0, orig_offset,
|
||||
orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset,
|
||||
NULL, 0,
|
||||
0);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue
Block a user