Try harder to find usable README links. Closes #14.

This commit is contained in:
Tony Garnock-Jones 2016-08-14 12:16:06 -04:00
parent 0c557fb3a0
commit 2afc337396
2 changed files with 232 additions and 27 deletions

151
src/http-utils.rkt Normal file
View File

@ -0,0 +1,151 @@
#lang racket/base
;; HTTP utilities
(provide http-redirection-limit
http-classify-status-code
http-interpret-response
http-simple-interpret-response
http-follow-redirects
http-sendrecv/url
http/interpret-response
http/simple-interpret-response
http/follow-redirects)
(require (only-in racket/port port->bytes))
(require (only-in racket/bytes bytes-join))
(require racket/match)
(require net/http-client)
(require net/head)
(require (except-in net/url http-sendrecv/url))
;; (Parameterof Number)
;; Number of redirections to automatically follow when retrieving via GET or HEAD.
(define http-redirection-limit (make-parameter 20))
;; Number -> Symbol
;; Returns the broad classification associated with a given HTTP status code.
(define (http-classify-status-code status-code)
(cond
[(<= status-code 99) 'unknown]
[(<= 100 status-code 199) 'informational]
[(<= 200 status-code 299) 'success]
[(<= 300 status-code 399) 'redirection]
[(<= 400 status-code 499) 'client-error]
[(<= 500 status-code 599) 'server-error]
[(<= 600 status-code) 'unknown]))
(define (parse-status-line status-line)
(match status-line
[(regexp #px#"^([^ ]+) ([^ ]+)( (.*))?$" (list _ v c _ r))
(values v (string->number (bytes->string/latin-1 c)) (bytes->string/latin-1 r))]
[_
(values #f #f #f)]))
(define (parse-headers response-headers [downcase-header-names? #t])
(for/list [(h (extract-all-fields (bytes-join response-headers #"\r\n")))]
(cons (string->symbol ((if downcase-header-names? string-downcase values)
(bytes->string/latin-1 (car h))))
(cdr h))))
;; <customizations ...>
;; -> Bytes (Listof Bytes) InputPort
;; -> (Values (Option Bytes)
;; (Option Number)
;; (Option String)
;; (Listof (Cons Symbol Bytes))
;; (if read-body? Bytes InputPort))
(define ((http-interpret-response #:downcase-header-names? [downcase-header-names? #t]
#:read-body? [read-body? #t])
status-line response-headers response-body-port)
(define-values (http-version status-code reason-phrase) (parse-status-line status-line))
(values http-version
status-code
reason-phrase
(parse-headers response-headers downcase-header-names?)
(if read-body?
(begin0 (port->bytes response-body-port)
(close-input-port response-body-port))
response-body-port)))
(define (http-simple-interpret-response status-line response-headers response-body-port)
(define-values (_http-version
status-code
_reason-phrase
headers
body)
((http-interpret-response) status-line response-headers response-body-port))
(values (http-classify-status-code status-code)
headers
body))
(define ((http-follow-redirects method
#:version [version #"1.1"])
status-line
response-headers
response-body-port)
(define ((check-response remaining-redirect-count)
status-line
response-headers
response-body-port)
(log-debug "http-follow-redirects: Checking request result: ~a\n" status-line)
(define-values (http-version status-code reason-phrase) (parse-status-line status-line))
(if (and (positive? remaining-redirect-count)
(eq? (http-classify-status-code status-code) 'redirection))
(match (assq 'location (parse-headers response-headers))
[#f (values status-line response-headers response-body-port)]
[(cons _location-header-label location-urlbytes)
(define location (string->url (bytes->string/latin-1 location-urlbytes)))
(void (port->bytes response-body-port)) ;; consume and discard input
(close-input-port response-body-port)
(log-debug "http-follow-redirects: Following redirection to ~a\n" location-urlbytes)
(call-with-values (lambda () (http-sendrecv/url location
#:version version
#:method method))
(check-response (- remaining-redirect-count 1)))])
(values status-line response-headers response-body-port)))
((check-response (http-redirection-limit))
status-line
response-headers
response-body-port))
;; Already present in net/url, but that variant doesn't take #:version
;; or allow overriding of #:ssl? and #:port.
(define (http-sendrecv/url u
#:ssl? [ssl? (equal? (url-scheme u) "https")]
#:port [port (or (url-port u) (if ssl? 443 80))]
#:version [version #"1.1"]
#:method [method #"GET"]
#:headers [headers '()]
#:data [data #f]
#:content-decode [decodes '(gzip)])
(http-sendrecv (url-host u)
(url->string u)
#:ssl? ssl?
#:port port
#:version version
#:method method
#:headers headers
#:data data
#:content-decode decodes))
(define-syntax-rule (http/interpret-response customization ... req-expr)
(call-with-values (lambda () req-expr)
(http-interpret-response customization ...)))
(define-syntax-rule (http/simple-interpret-response req-expr)
(call-with-values (lambda () req-expr)
http-simple-interpret-response))
(define-syntax-rule (http/follow-redirects customization ... req-expr)
(call-with-values (lambda () req-expr)
(http-follow-redirects customization ...)))
(module+ test
(require rackunit)
(http/simple-interpret-response
(http/follow-redirects
#"HEAD"
(http-sendrecv/url (string->url "http://google.com/") #:method #"HEAD")))
)

View File

@ -11,11 +11,11 @@
(require racket/date)
(require racket/string)
(require racket/port)
(require (only-in racket/list filter-map))
(require (only-in racket/list filter-map drop-right))
(require (only-in racket/exn exn->string))
(require net/url)
(require (except-in net/url http-sendrecv/url))
(require net/uri-codec)
(require web-server/servlet)
(require (except-in web-server/servlet http-sendrecv/url))
(require json)
(require "gravatar.rkt")
(require "bootstrap.rkt")
@ -29,6 +29,7 @@
(require "hash-utils.rkt")
(require "static.rkt")
(require "package-source.rkt")
(require "http-utils.rkt")
(define static-urlprefix
(or (@ (config) static-urlprefix)
@ -1417,32 +1418,85 @@
(define default-version (package-default-version pkg))
(define external-information
(and pkg
(if (equal? (@ default-version source)
(@ default-version source_url))
;; We don't know where to look for a readme.
(hash)
;; It's probably a github-like repo. Check for a readme.
(let ((contents
(with-handlers ([exn:fail:network?
(lambda (e)
(log-warning
"Network error retrieving possible readme for ~a:\n~a"
package-name
(exn->string e))
"")])
(define the-port
(get-pure-port (string->url (@ default-version source_url))
#:redirections 10))
(begin0 (port->string the-port)
(close-input-port the-port)))))
;;(log-info "CONTENTS: ~a === ~a" (@ default-version source_url) contents)
(if (regexp-match? #px"(?i:id=.readme.)" contents)
(let ((readme-url (string-append (@ default-version source_url) "#readme")))
(log-info "Package ~a has a readme at ~a" package-name readme-url)
(hash 'readme-url readme-url))
(hash))))))
(match (source->readme-url (@ default-version source))
[#f (hash)]
[readme-url
(log-info "Package ~a has a readme at ~a" package-name readme-url)
(hash 'readme-url readme-url)])))
(set-package-external-information! package-name external-information))
;; String -> (Option String)
;;
;; Attempt to discover a "nearby" README for a given git "source URL",
;; operating on the assumption that we have a vaguely github-like
;; setup. We can do better here once we get a feel for what other
;; possibilities exist out there, and how we can abstract over them.
;;
(define (source->readme-url s)
;; SYNTAX
;; If exn:fail:network is raised, logs a warning and returns #f
(define-syntax-rule (ignore-network-errors body ...)
(with-handlers ([exn:fail:network?
(lambda (e)
(log-warning
"Network error retrieving possible readme for source URL ~a:\n~a"
s
(exn->string e))
#f)])
body ...))
;; URL -> (Option String)
;; Helper: Check for a "README.md" resource as a subresource of the
;; given URL. Return the README's URL if it is found; otherwise #f.
(define (extant-readme-md-urlstring u)
(ignore-network-errors
(define readme-u (struct-copy url u
[path (append (url-path u) (list (path/param "README.md" '())))]))
(log-info "Checking for readme at ~a ..." (url->string readme-u))
(match/values (http/simple-interpret-response
(http/follow-redirects
#"HEAD"
(http-sendrecv/url readme-u #:method #"HEAD")))
[('success _headers _body) (url->string readme-u)]
[(_ _ _) #f])))
;; URL -> (Option String)
;; Helper: Retrieves the given resource and greps it for
;; id="readme", more or less, to determine whether there's a usable
;; fragment there.
(define (horrible-readme-scraping-hack u)
(ignore-network-errors
(log-info "Checking for readme fragment at ~a ..." (url->string u))
(match/values (http/simple-interpret-response
(http/follow-redirects
#"GET"
(http-sendrecv/url u #:method #"GET")))
[('success _headers body)
(and (regexp-match? #px"(?i:id=.readme.)" body)
(string-append (url->string u) "#readme"))]
[(_ _ _) #f])))
(define-values (p _complaints) (parse-package-source s))
(and (git-source? p)
;; Search from the location given up into parent directories
;; until we reach the repo root.
(let* ((root-p (struct-copy git-source p [path ""]))
(root-u (string->url (parsed-package-source-human-tree-url root-p)))
(here-u (string->url (parsed-package-source-human-tree-url p))))
(and (member (url-scheme here-u) (list "http" "https"))
(let loop ((here-u here-u))
;; Strategy: Try to directly retrieve "README.md"
;; first. In principle, we could/should try other
;; github-supported READMEish names here, but if this
;; first check fails we go for a horrible
;; content-scraping strategy instead.
(or (extant-readme-md-urlstring here-u)
(horrible-readme-scraping-hack here-u)
(and (not (equal? here-u root-u))
(loop (struct-copy url here-u
[path (drop-right (url-path here-u) 1)])))))))))
(define (rerender! items-to-rerender)
(thread-send (package-change-handler-thread) (list 'rerender! items-to-rerender)))