From 2afc337396691f4b80b430e9df1a0adc4030a225 Mon Sep 17 00:00:00 2001 From: Tony Garnock-Jones Date: Sun, 14 Aug 2016 12:16:06 -0400 Subject: [PATCH] Try harder to find usable README links. Closes #14. --- src/http-utils.rkt | 151 +++++++++++++++++++++++++++++++++++++++++++++ src/site.rkt | 108 ++++++++++++++++++++++++-------- 2 files changed, 232 insertions(+), 27 deletions(-) create mode 100644 src/http-utils.rkt diff --git a/src/http-utils.rkt b/src/http-utils.rkt new file mode 100644 index 0000000..ac5c832 --- /dev/null +++ b/src/http-utils.rkt @@ -0,0 +1,151 @@ +#lang racket/base +;; HTTP utilities + +(provide http-redirection-limit + http-classify-status-code + http-interpret-response + http-simple-interpret-response + http-follow-redirects + http-sendrecv/url + + http/interpret-response + http/simple-interpret-response + http/follow-redirects) + +(require (only-in racket/port port->bytes)) +(require (only-in racket/bytes bytes-join)) +(require racket/match) +(require net/http-client) +(require net/head) +(require (except-in net/url http-sendrecv/url)) + +;; (Parameterof Number) +;; Number of redirections to automatically follow when retrieving via GET or HEAD. +(define http-redirection-limit (make-parameter 20)) + +;; Number -> Symbol +;; Returns the broad classification associated with a given HTTP status code. +(define (http-classify-status-code status-code) + (cond + [(<= status-code 99) 'unknown] + [(<= 100 status-code 199) 'informational] + [(<= 200 status-code 299) 'success] + [(<= 300 status-code 399) 'redirection] + [(<= 400 status-code 499) 'client-error] + [(<= 500 status-code 599) 'server-error] + [(<= 600 status-code) 'unknown])) + +(define (parse-status-line status-line) + (match status-line + [(regexp #px#"^([^ ]+) ([^ ]+)( (.*))?$" (list _ v c _ r)) + (values v (string->number (bytes->string/latin-1 c)) (bytes->string/latin-1 r))] + [_ + (values #f #f #f)])) + +(define (parse-headers response-headers [downcase-header-names? #t]) + (for/list [(h (extract-all-fields (bytes-join response-headers #"\r\n")))] + (cons (string->symbol ((if downcase-header-names? string-downcase values) + (bytes->string/latin-1 (car h)))) + (cdr h)))) + +;; +;; -> Bytes (Listof Bytes) InputPort +;; -> (Values (Option Bytes) +;; (Option Number) +;; (Option String) +;; (Listof (Cons Symbol Bytes)) +;; (if read-body? Bytes InputPort)) +(define ((http-interpret-response #:downcase-header-names? [downcase-header-names? #t] + #:read-body? [read-body? #t]) + status-line response-headers response-body-port) + (define-values (http-version status-code reason-phrase) (parse-status-line status-line)) + (values http-version + status-code + reason-phrase + (parse-headers response-headers downcase-header-names?) + (if read-body? + (begin0 (port->bytes response-body-port) + (close-input-port response-body-port)) + response-body-port))) + +(define (http-simple-interpret-response status-line response-headers response-body-port) + (define-values (_http-version + status-code + _reason-phrase + headers + body) + ((http-interpret-response) status-line response-headers response-body-port)) + (values (http-classify-status-code status-code) + headers + body)) + +(define ((http-follow-redirects method + #:version [version #"1.1"]) + status-line + response-headers + response-body-port) + (define ((check-response remaining-redirect-count) + status-line + response-headers + response-body-port) + (log-debug "http-follow-redirects: Checking request result: ~a\n" status-line) + (define-values (http-version status-code reason-phrase) (parse-status-line status-line)) + (if (and (positive? remaining-redirect-count) + (eq? (http-classify-status-code status-code) 'redirection)) + (match (assq 'location (parse-headers response-headers)) + [#f (values status-line response-headers response-body-port)] + [(cons _location-header-label location-urlbytes) + (define location (string->url (bytes->string/latin-1 location-urlbytes))) + (void (port->bytes response-body-port)) ;; consume and discard input + (close-input-port response-body-port) + (log-debug "http-follow-redirects: Following redirection to ~a\n" location-urlbytes) + (call-with-values (lambda () (http-sendrecv/url location + #:version version + #:method method)) + (check-response (- remaining-redirect-count 1)))]) + (values status-line response-headers response-body-port))) + ((check-response (http-redirection-limit)) + status-line + response-headers + response-body-port)) + +;; Already present in net/url, but that variant doesn't take #:version +;; or allow overriding of #:ssl? and #:port. +(define (http-sendrecv/url u + #:ssl? [ssl? (equal? (url-scheme u) "https")] + #:port [port (or (url-port u) (if ssl? 443 80))] + #:version [version #"1.1"] + #:method [method #"GET"] + #:headers [headers '()] + #:data [data #f] + #:content-decode [decodes '(gzip)]) + (http-sendrecv (url-host u) + (url->string u) + #:ssl? ssl? + #:port port + #:version version + #:method method + #:headers headers + #:data data + #:content-decode decodes)) + +(define-syntax-rule (http/interpret-response customization ... req-expr) + (call-with-values (lambda () req-expr) + (http-interpret-response customization ...))) + +(define-syntax-rule (http/simple-interpret-response req-expr) + (call-with-values (lambda () req-expr) + http-simple-interpret-response)) + +(define-syntax-rule (http/follow-redirects customization ... req-expr) + (call-with-values (lambda () req-expr) + (http-follow-redirects customization ...))) + +(module+ test + (require rackunit) + + (http/simple-interpret-response + (http/follow-redirects + #"HEAD" + (http-sendrecv/url (string->url "http://google.com/") #:method #"HEAD"))) + ) diff --git a/src/site.rkt b/src/site.rkt index 8025348..5030a63 100644 --- a/src/site.rkt +++ b/src/site.rkt @@ -11,11 +11,11 @@ (require racket/date) (require racket/string) (require racket/port) -(require (only-in racket/list filter-map)) +(require (only-in racket/list filter-map drop-right)) (require (only-in racket/exn exn->string)) -(require net/url) +(require (except-in net/url http-sendrecv/url)) (require net/uri-codec) -(require web-server/servlet) +(require (except-in web-server/servlet http-sendrecv/url)) (require json) (require "gravatar.rkt") (require "bootstrap.rkt") @@ -29,6 +29,7 @@ (require "hash-utils.rkt") (require "static.rkt") (require "package-source.rkt") +(require "http-utils.rkt") (define static-urlprefix (or (@ (config) static-urlprefix) @@ -1417,32 +1418,85 @@ (define default-version (package-default-version pkg)) (define external-information (and pkg - (if (equal? (@ default-version source) - (@ default-version source_url)) - ;; We don't know where to look for a readme. - (hash) - ;; It's probably a github-like repo. Check for a readme. - (let ((contents - (with-handlers ([exn:fail:network? - (lambda (e) - (log-warning - "Network error retrieving possible readme for ~a:\n~a" - package-name - (exn->string e)) - "")]) - (define the-port - (get-pure-port (string->url (@ default-version source_url)) - #:redirections 10)) - (begin0 (port->string the-port) - (close-input-port the-port))))) - ;;(log-info "CONTENTS: ~a === ~a" (@ default-version source_url) contents) - (if (regexp-match? #px"(?i:id=.readme.)" contents) - (let ((readme-url (string-append (@ default-version source_url) "#readme"))) - (log-info "Package ~a has a readme at ~a" package-name readme-url) - (hash 'readme-url readme-url)) - (hash)))))) + (match (source->readme-url (@ default-version source)) + [#f (hash)] + [readme-url + (log-info "Package ~a has a readme at ~a" package-name readme-url) + (hash 'readme-url readme-url)]))) (set-package-external-information! package-name external-information)) +;; String -> (Option String) +;; +;; Attempt to discover a "nearby" README for a given git "source URL", +;; operating on the assumption that we have a vaguely github-like +;; setup. We can do better here once we get a feel for what other +;; possibilities exist out there, and how we can abstract over them. +;; +(define (source->readme-url s) + + ;; SYNTAX + ;; If exn:fail:network is raised, logs a warning and returns #f + (define-syntax-rule (ignore-network-errors body ...) + (with-handlers ([exn:fail:network? + (lambda (e) + (log-warning + "Network error retrieving possible readme for source URL ~a:\n~a" + s + (exn->string e)) + #f)]) + body ...)) + + ;; URL -> (Option String) + ;; Helper: Check for a "README.md" resource as a subresource of the + ;; given URL. Return the README's URL if it is found; otherwise #f. + (define (extant-readme-md-urlstring u) + (ignore-network-errors + (define readme-u (struct-copy url u + [path (append (url-path u) (list (path/param "README.md" '())))])) + (log-info "Checking for readme at ~a ..." (url->string readme-u)) + (match/values (http/simple-interpret-response + (http/follow-redirects + #"HEAD" + (http-sendrecv/url readme-u #:method #"HEAD"))) + [('success _headers _body) (url->string readme-u)] + [(_ _ _) #f]))) + + ;; URL -> (Option String) + ;; Helper: Retrieves the given resource and greps it for + ;; id="readme", more or less, to determine whether there's a usable + ;; fragment there. + (define (horrible-readme-scraping-hack u) + (ignore-network-errors + (log-info "Checking for readme fragment at ~a ..." (url->string u)) + (match/values (http/simple-interpret-response + (http/follow-redirects + #"GET" + (http-sendrecv/url u #:method #"GET"))) + [('success _headers body) + (and (regexp-match? #px"(?i:id=.readme.)" body) + (string-append (url->string u) "#readme"))] + [(_ _ _) #f]))) + + (define-values (p _complaints) (parse-package-source s)) + (and (git-source? p) + ;; Search from the location given up into parent directories + ;; until we reach the repo root. + (let* ((root-p (struct-copy git-source p [path ""])) + (root-u (string->url (parsed-package-source-human-tree-url root-p))) + (here-u (string->url (parsed-package-source-human-tree-url p)))) + (and (member (url-scheme here-u) (list "http" "https")) + (let loop ((here-u here-u)) + ;; Strategy: Try to directly retrieve "README.md" + ;; first. In principle, we could/should try other + ;; github-supported READMEish names here, but if this + ;; first check fails we go for a horrible + ;; content-scraping strategy instead. + (or (extant-readme-md-urlstring here-u) + (horrible-readme-scraping-hack here-u) + (and (not (equal? here-u root-u)) + (loop (struct-copy url here-u + [path (drop-right (url-path here-u) 1)]))))))))) + (define (rerender! items-to-rerender) (thread-send (package-change-handler-thread) (list 'rerender! items-to-rerender)))