From acb98cef60d05970c628cf610361f2a4d705ba50 Mon Sep 17 00:00:00 2001 From: Matthew Flatt Date: Tue, 11 May 2010 06:03:39 -0600 Subject: [PATCH] allow paths as input to regexp-matching functions --- collects/racket/private/string.rkt | 21 ++++++++-- collects/scribblings/reference/regexps.scrbl | 36 +++++++++------- src/racket/src/regexp.c | 43 +++++++++++++------- 3 files changed, 66 insertions(+), 34 deletions(-) diff --git a/collects/racket/private/string.rkt b/collects/racket/private/string.rkt index 784d23ef3d..c5c692d190 100644 --- a/collects/racket/private/string.rkt +++ b/collects/racket/private/string.rkt @@ -126,7 +126,12 @@ success-choose failure-k port-success-k port-success-choose port-failure-k need-leftover? peek?) - (let* ([len (cond [(string? string) (string-length string)] + (let* ([string (if (path? string) + (if (or (string? pattern) (regexp? pattern)) + (path->string string) + (path->bytes string)) + string)] + [len (cond [(string? string) (string-length string)] [(bytes? string) (bytes-length string)] [else #f])] [orig-rx (cond [(bytes? pattern) (byte-regexp pattern)] @@ -143,7 +148,7 @@ (raise-type-error 'name "input port" string)) (unless (or len (input-port? string)) (raise-type-error - 'name "string, byte string or input port" string))) + 'name "string, byte string, path, or input port" string))) (unless (and (number? start) (exact? start) (integer? start) (start . >= . 0)) (raise-type-error 'name "non-negative exact integer" start)) @@ -283,7 +288,11 @@ (bytes? pattern))) (string->bytes/utf-8 string (char->integer #\?)) string)) - (define sub (if (bytes? buf) subbytes substring)) + (define sub (if (or (bytes? buf) (and (path? string) + (or (bytes? pattern) + (byte-regexp? pattern)))) + subbytes + substring)) (regexp-loop regexp-split loop start end pattern buf ipre ;; success-choose: (lambda (start mstart mend ms acc) (cons (sub buf start mstart) acc)) @@ -418,7 +427,11 @@ (bytes? 
pattern))) (string->bytes/utf-8 string (char->integer #\?)) string)) - (define sub (if (bytes? buf) subbytes substring)) + (define sub (if (or (bytes? buf) (and (path? string) + (or (bytes? pattern) + (byte-regexp? pattern)))) + subbytes + substring)) (regexp-loop regexp-match* loop start end pattern buf ipre ;; success-choose: (lambda (start mstart mend ms acc) (cons (sub buf mstart mend) acc)) diff --git a/collects/scribblings/reference/regexps.scrbl b/collects/scribblings/reference/regexps.scrbl index 53d418025b..cd9be5c3b6 100644 --- a/collects/scribblings/reference/regexps.scrbl +++ b/collects/scribblings/reference/regexps.scrbl @@ -210,20 +210,26 @@ the start of the input or of a line.} @section{Regexp Matching} @defproc[(regexp-match [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [output-port (or/c output-port? #f) #f] [input-prefix bytes? #""]) (if (and (or (string? pattern) (regexp? pattern)) - (string? input)) + (or (string? input) (path? input))) (or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c bytes? (listof (or/c bytes? #f)))))]{ -Attempts to match @racket[pattern] (a string, byte string, @tech{regexp -value}, or byte-@tech{regexp value}) once to a portion of @racket[input]. The -matcher finds a portion of @racket[input] that matches and is closest -to the start of the input (after @racket[start-pos]). +Attempts to match @racket[pattern] (a string, byte string, +@tech{regexp value}, or byte-@tech{regexp value}) once to a portion of +@racket[input]. The matcher finds a portion of @racket[input] that +matches and is closest to the start of the input (after +@racket[start-pos]). + +If @racket[input] is a path, it is converted to a byte string with +@racket[path->bytes] if @racket[pattern] is a byte string or a +byte-based regexp. 
Otherwise, @racket[input] is converted to a string +with @racket[path->string]. The optional @racket[start-pos] and @racket[end-pos] arguments select a portion of @racket[input] for matching; the default is the entire @@ -320,12 +326,12 @@ bytes. To avoid such interleaving, use @racket[regexp-match-peek] @defproc[(regexp-match* [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [input-prefix bytes? #""]) (if (and (or (string? pattern) (regexp? pattern)) - (string? input)) + (or (string? input) (path? input))) (listof string?) (listof bytes?))]{ @@ -381,7 +387,7 @@ fails.} @defproc[(regexp-match-positions [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [output-port (or/c output-port? #f) #f] @@ -414,7 +420,7 @@ positions indicate the number of bytes that were read, including ]} @defproc[(regexp-match-positions* [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [input-prefix bytes? #""]) @@ -430,7 +436,7 @@ like @racket[regexp-match*]. @defproc[(regexp-match? [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [output-port (or/c output-port? #f) #f] @@ -447,7 +453,7 @@ match succeeds, @racket[#f] otherwise. @defproc[(regexp-match-exact? [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? 
input-port?)]) + [input (or/c string? bytes? path? input-port?)]) boolean?]{ Like @racket[regexp-match?], but @racket[#t] is only returned when the @@ -553,7 +559,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like @racket[regexp-match*].} @defproc[(regexp-match/end [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [output-port (or/c output-port? #f) #f] @@ -561,7 +567,7 @@ Like @racket[regexp-match-peek-positions], but returns multiple matches like [count nonnegative-exact-integer? 1]) (values (if (and (or (string? pattern) (regexp? pattern)) - (string? input)) + (or (string? input) (path? input))) (or/c #f (cons/c string? (listof (or/c string? #f)))) (or/c #f (cons/c bytes? (listof (or/c bytes? #f))))) (or/c #f bytes?))]{ @@ -578,7 +584,7 @@ to determine an appropriate value for @racket[count].} @deftogether[( @defproc[(regexp-match-positions/end [pattern (or/c string? bytes? regexp? byte-regexp?)] - [input (or/c string? bytes? input-port?)] + [input (or/c string? bytes? path? input-port?)] [start-pos exact-nonnegative-integer? 0] [end-pos (or/c exact-nonnegative-integer? #f) #f] [input-prefix bytes? 
#""] diff --git a/src/racket/src/regexp.c b/src/racket/src/regexp.c index efc4b7bec6..9279c23f32 100644 --- a/src/racket/src/regexp.c +++ b/src/racket/src/regexp.c @@ -5000,25 +5000,38 @@ static Scheme_Object *gen_compare(char *name, int pos, rxpos *startp, *maybep, *endp, prefix_len = 0, prefix_offset = 0, minpos; int offset = 0, orig_offset, endset, m, was_non_byte, last_bytes_count = last_bytes; Scheme_Object *iport, *oport = NULL, *startv = NULL, *endv = NULL, *dropped, *unless_evt = NULL; - Scheme_Object *last_bytes_str = scheme_false; + Scheme_Object *last_bytes_str = scheme_false, *srcin; if (SCHEME_TYPE(argv[0]) != scheme_regexp_type && !SCHEME_BYTE_STRINGP(argv[0]) && !SCHEME_CHAR_STRINGP(argv[0])) scheme_wrong_type(name, "regexp, byte-regexp, string, or byte string", 0, argc, argv); if ((peek || (!SCHEME_BYTE_STRINGP(argv[1]) && !SCHEME_CHAR_STRINGP(argv[1]))) - && !SCHEME_INPUT_PORTP(argv[1])) - scheme_wrong_type(name, peek ? "input-port" : "string, byte string, or input port", 1, argc, argv); + && !SCHEME_INPUT_PORTP(argv[1]) + && !SCHEME_PATHP(argv[1])) + scheme_wrong_type(name, peek ? 
"input-port" : "string, byte string, path, or input port", 1, argc, argv); - if (SCHEME_CHAR_STRINGP(argv[1])) { + srcin = argv[1]; + if (SCHEME_PATHP(srcin)) { + if (SCHEME_BYTE_STRINGP(argv[0]) + || (SAME_TYPE(SCHEME_TYPE(argv[0]), scheme_regexp_type) + && !(((regexp *)argv[0])->flags & REGEXP_IS_UTF8))) + srcin = scheme_make_sized_byte_string(SCHEME_PATH_VAL(srcin), + SCHEME_PATH_LEN(srcin), + 1); + else + srcin = scheme_path_to_char_string(srcin); + } + + if (SCHEME_CHAR_STRINGP(srcin)) { iport = NULL; - endset = SCHEME_CHAR_STRLEN_VAL(argv[1]); - } else if (SCHEME_INPUT_PORTP(argv[1])) { - iport = argv[1]; + endset = SCHEME_CHAR_STRLEN_VAL(srcin); + } else if (SCHEME_INPUT_PORTP(srcin)) { + iport = srcin; endset = -2; } else { iport = NULL; - endset = SCHEME_BYTE_STRLEN_VAL(argv[1]); + endset = SCHEME_BYTE_STRLEN_VAL(srcin); } if (argc > 2) { @@ -5027,7 +5040,7 @@ static Scheme_Object *gen_compare(char *name, int pos, offset = scheme_extract_index(name, 2, argc, argv, len + 1, 0); if (!iport && (offset > len)) { - scheme_out_of_string_range(name, "offset ", argv[2], argv[1], 0, len); + scheme_out_of_string_range(name, "offset ", argv[2], srcin, 0, len); return NULL; } else if (offset < 0) { /* argument was a bignum */ @@ -5052,7 +5065,7 @@ static Scheme_Object *gen_compare(char *name, int pos, return NULL; } } else if (endset < offset || endset > len) { - scheme_out_of_string_range(name, "ending ", argv[3], argv[1], offset, len); + scheme_out_of_string_range(name, "ending ", argv[3], srcin, offset, len); return NULL; } endv = argv[3]; @@ -5117,16 +5130,16 @@ static Scheme_Object *gen_compare(char *name, int pos, was_non_byte = 0; orig_offset = 0; /* extra offset */ if (!iport) { - if (SCHEME_BYTE_STRINGP(argv[1])) - full_s = SCHEME_BYTE_STR_VAL(argv[1]); + if (SCHEME_BYTE_STRINGP(srcin)) + full_s = SCHEME_BYTE_STR_VAL(srcin); else { /* Extract substring and UTF-8 encode: */ int blen; - blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset, + 
blen = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset, NULL, 0, 0 /* not UTF-16 */); full_s = (char *)scheme_malloc_atomic(blen); - scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), offset, endset, + scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), offset, endset, (unsigned char *)full_s, 0, 0 /* not UTF-16 */); orig_offset = offset; @@ -5136,7 +5149,7 @@ static Scheme_Object *gen_compare(char *name, int pos, was_non_byte = 1; else { /* Convert orig_offset into encoded bytes */ - orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(argv[1]), 0, orig_offset, + orig_offset = scheme_utf8_encode(SCHEME_CHAR_STR_VAL(srcin), 0, orig_offset, NULL, 0, 0); }