Try harder to find usable README links. Closes #14.
This commit is contained in:
parent
0c557fb3a0
commit
2afc337396
151
src/http-utils.rkt
Normal file
151
src/http-utils.rkt
Normal file
|
@ -0,0 +1,151 @@
|
|||
#lang racket/base
|
||||
;; HTTP utilities
|
||||
|
||||
(provide http-redirection-limit
|
||||
http-classify-status-code
|
||||
http-interpret-response
|
||||
http-simple-interpret-response
|
||||
http-follow-redirects
|
||||
http-sendrecv/url
|
||||
|
||||
http/interpret-response
|
||||
http/simple-interpret-response
|
||||
http/follow-redirects)
|
||||
|
||||
(require (only-in racket/port port->bytes))
|
||||
(require (only-in racket/bytes bytes-join))
|
||||
(require racket/match)
|
||||
(require net/http-client)
|
||||
(require net/head)
|
||||
(require (except-in net/url http-sendrecv/url))
|
||||
|
||||
;; (Parameterof Number)
|
||||
;; Number of redirections to automatically follow when retrieving via GET or HEAD.
|
||||
(define http-redirection-limit (make-parameter 20))
|
||||
|
||||
;; Number -> Symbol
|
||||
;; Returns the broad classification associated with a given HTTP status code.
|
||||
(define (http-classify-status-code status-code)
|
||||
(cond
|
||||
[(<= status-code 99) 'unknown]
|
||||
[(<= 100 status-code 199) 'informational]
|
||||
[(<= 200 status-code 299) 'success]
|
||||
[(<= 300 status-code 399) 'redirection]
|
||||
[(<= 400 status-code 499) 'client-error]
|
||||
[(<= 500 status-code 599) 'server-error]
|
||||
[(<= 600 status-code) 'unknown]))
|
||||
|
||||
(define (parse-status-line status-line)
|
||||
(match status-line
|
||||
[(regexp #px#"^([^ ]+) ([^ ]+)( (.*))?$" (list _ v c _ r))
|
||||
(values v (string->number (bytes->string/latin-1 c)) (bytes->string/latin-1 r))]
|
||||
[_
|
||||
(values #f #f #f)]))
|
||||
|
||||
(define (parse-headers response-headers [downcase-header-names? #t])
|
||||
(for/list [(h (extract-all-fields (bytes-join response-headers #"\r\n")))]
|
||||
(cons (string->symbol ((if downcase-header-names? string-downcase values)
|
||||
(bytes->string/latin-1 (car h))))
|
||||
(cdr h))))
|
||||
|
||||
;; <customizations ...>
|
||||
;; -> Bytes (Listof Bytes) InputPort
|
||||
;; -> (Values (Option Bytes)
|
||||
;; (Option Number)
|
||||
;; (Option String)
|
||||
;; (Listof (Cons Symbol Bytes))
|
||||
;; (if read-body? Bytes InputPort))
|
||||
(define ((http-interpret-response #:downcase-header-names? [downcase-header-names? #t]
|
||||
#:read-body? [read-body? #t])
|
||||
status-line response-headers response-body-port)
|
||||
(define-values (http-version status-code reason-phrase) (parse-status-line status-line))
|
||||
(values http-version
|
||||
status-code
|
||||
reason-phrase
|
||||
(parse-headers response-headers downcase-header-names?)
|
||||
(if read-body?
|
||||
(begin0 (port->bytes response-body-port)
|
||||
(close-input-port response-body-port))
|
||||
response-body-port)))
|
||||
|
||||
(define (http-simple-interpret-response status-line response-headers response-body-port)
|
||||
(define-values (_http-version
|
||||
status-code
|
||||
_reason-phrase
|
||||
headers
|
||||
body)
|
||||
((http-interpret-response) status-line response-headers response-body-port))
|
||||
(values (http-classify-status-code status-code)
|
||||
headers
|
||||
body))
|
||||
|
||||
(define ((http-follow-redirects method
|
||||
#:version [version #"1.1"])
|
||||
status-line
|
||||
response-headers
|
||||
response-body-port)
|
||||
(define ((check-response remaining-redirect-count)
|
||||
status-line
|
||||
response-headers
|
||||
response-body-port)
|
||||
(log-debug "http-follow-redirects: Checking request result: ~a\n" status-line)
|
||||
(define-values (http-version status-code reason-phrase) (parse-status-line status-line))
|
||||
(if (and (positive? remaining-redirect-count)
|
||||
(eq? (http-classify-status-code status-code) 'redirection))
|
||||
(match (assq 'location (parse-headers response-headers))
|
||||
[#f (values status-line response-headers response-body-port)]
|
||||
[(cons _location-header-label location-urlbytes)
|
||||
(define location (string->url (bytes->string/latin-1 location-urlbytes)))
|
||||
(void (port->bytes response-body-port)) ;; consume and discard input
|
||||
(close-input-port response-body-port)
|
||||
(log-debug "http-follow-redirects: Following redirection to ~a\n" location-urlbytes)
|
||||
(call-with-values (lambda () (http-sendrecv/url location
|
||||
#:version version
|
||||
#:method method))
|
||||
(check-response (- remaining-redirect-count 1)))])
|
||||
(values status-line response-headers response-body-port)))
|
||||
((check-response (http-redirection-limit))
|
||||
status-line
|
||||
response-headers
|
||||
response-body-port))
|
||||
|
||||
;; Already present in net/url, but that variant doesn't take #:version
|
||||
;; or allow overriding of #:ssl? and #:port.
|
||||
(define (http-sendrecv/url u
|
||||
#:ssl? [ssl? (equal? (url-scheme u) "https")]
|
||||
#:port [port (or (url-port u) (if ssl? 443 80))]
|
||||
#:version [version #"1.1"]
|
||||
#:method [method #"GET"]
|
||||
#:headers [headers '()]
|
||||
#:data [data #f]
|
||||
#:content-decode [decodes '(gzip)])
|
||||
(http-sendrecv (url-host u)
|
||||
(url->string u)
|
||||
#:ssl? ssl?
|
||||
#:port port
|
||||
#:version version
|
||||
#:method method
|
||||
#:headers headers
|
||||
#:data data
|
||||
#:content-decode decodes))
|
||||
|
||||
(define-syntax-rule (http/interpret-response customization ... req-expr)
|
||||
(call-with-values (lambda () req-expr)
|
||||
(http-interpret-response customization ...)))
|
||||
|
||||
(define-syntax-rule (http/simple-interpret-response req-expr)
|
||||
(call-with-values (lambda () req-expr)
|
||||
http-simple-interpret-response))
|
||||
|
||||
(define-syntax-rule (http/follow-redirects customization ... req-expr)
|
||||
(call-with-values (lambda () req-expr)
|
||||
(http-follow-redirects customization ...)))
|
||||
|
||||
(module+ test
|
||||
(require rackunit)
|
||||
|
||||
(http/simple-interpret-response
|
||||
(http/follow-redirects
|
||||
#"HEAD"
|
||||
(http-sendrecv/url (string->url "http://google.com/") #:method #"HEAD")))
|
||||
)
|
108
src/site.rkt
108
src/site.rkt
|
@ -11,11 +11,11 @@
|
|||
(require racket/date)
|
||||
(require racket/string)
|
||||
(require racket/port)
|
||||
(require (only-in racket/list filter-map))
|
||||
(require (only-in racket/list filter-map drop-right))
|
||||
(require (only-in racket/exn exn->string))
|
||||
(require net/url)
|
||||
(require (except-in net/url http-sendrecv/url))
|
||||
(require net/uri-codec)
|
||||
(require web-server/servlet)
|
||||
(require (except-in web-server/servlet http-sendrecv/url))
|
||||
(require json)
|
||||
(require "gravatar.rkt")
|
||||
(require "bootstrap.rkt")
|
||||
|
@ -29,6 +29,7 @@
|
|||
(require "hash-utils.rkt")
|
||||
(require "static.rkt")
|
||||
(require "package-source.rkt")
|
||||
(require "http-utils.rkt")
|
||||
|
||||
(define static-urlprefix
|
||||
(or (@ (config) static-urlprefix)
|
||||
|
@ -1417,32 +1418,85 @@
|
|||
(define default-version (package-default-version pkg))
|
||||
(define external-information
|
||||
(and pkg
|
||||
(if (equal? (@ default-version source)
|
||||
(@ default-version source_url))
|
||||
;; We don't know where to look for a readme.
|
||||
(hash)
|
||||
;; It's probably a github-like repo. Check for a readme.
|
||||
(let ((contents
|
||||
(with-handlers ([exn:fail:network?
|
||||
(lambda (e)
|
||||
(log-warning
|
||||
"Network error retrieving possible readme for ~a:\n~a"
|
||||
package-name
|
||||
(exn->string e))
|
||||
"")])
|
||||
(define the-port
|
||||
(get-pure-port (string->url (@ default-version source_url))
|
||||
#:redirections 10))
|
||||
(begin0 (port->string the-port)
|
||||
(close-input-port the-port)))))
|
||||
;;(log-info "CONTENTS: ~a === ~a" (@ default-version source_url) contents)
|
||||
(if (regexp-match? #px"(?i:id=.readme.)" contents)
|
||||
(let ((readme-url (string-append (@ default-version source_url) "#readme")))
|
||||
(log-info "Package ~a has a readme at ~a" package-name readme-url)
|
||||
(hash 'readme-url readme-url))
|
||||
(hash))))))
|
||||
(match (source->readme-url (@ default-version source))
|
||||
[#f (hash)]
|
||||
[readme-url
|
||||
(log-info "Package ~a has a readme at ~a" package-name readme-url)
|
||||
(hash 'readme-url readme-url)])))
|
||||
(set-package-external-information! package-name external-information))
|
||||
|
||||
;; String -> (Option String)
|
||||
;;
|
||||
;; Attempt to discover a "nearby" README for a given git "source URL",
|
||||
;; operating on the assumption that we have a vaguely github-like
|
||||
;; setup. We can do better here once we get a feel for what other
|
||||
;; possibilities exist out there, and how we can abstract over them.
|
||||
;;
|
||||
(define (source->readme-url s)
|
||||
|
||||
;; SYNTAX
|
||||
;; If exn:fail:network is raised, logs a warning and returns #f
|
||||
(define-syntax-rule (ignore-network-errors body ...)
|
||||
(with-handlers ([exn:fail:network?
|
||||
(lambda (e)
|
||||
(log-warning
|
||||
"Network error retrieving possible readme for source URL ~a:\n~a"
|
||||
s
|
||||
(exn->string e))
|
||||
#f)])
|
||||
body ...))
|
||||
|
||||
;; URL -> (Option String)
|
||||
;; Helper: Check for a "README.md" resource as a subresource of the
|
||||
;; given URL. Return the README's URL if it is found; otherwise #f.
|
||||
(define (extant-readme-md-urlstring u)
|
||||
(ignore-network-errors
|
||||
(define readme-u (struct-copy url u
|
||||
[path (append (url-path u) (list (path/param "README.md" '())))]))
|
||||
(log-info "Checking for readme at ~a ..." (url->string readme-u))
|
||||
(match/values (http/simple-interpret-response
|
||||
(http/follow-redirects
|
||||
#"HEAD"
|
||||
(http-sendrecv/url readme-u #:method #"HEAD")))
|
||||
[('success _headers _body) (url->string readme-u)]
|
||||
[(_ _ _) #f])))
|
||||
|
||||
;; URL -> (Option String)
|
||||
;; Helper: Retrieves the given resource and greps it for
|
||||
;; id="readme", more or less, to determine whether there's a usable
|
||||
;; fragment there.
|
||||
(define (horrible-readme-scraping-hack u)
|
||||
(ignore-network-errors
|
||||
(log-info "Checking for readme fragment at ~a ..." (url->string u))
|
||||
(match/values (http/simple-interpret-response
|
||||
(http/follow-redirects
|
||||
#"GET"
|
||||
(http-sendrecv/url u #:method #"GET")))
|
||||
[('success _headers body)
|
||||
(and (regexp-match? #px"(?i:id=.readme.)" body)
|
||||
(string-append (url->string u) "#readme"))]
|
||||
[(_ _ _) #f])))
|
||||
|
||||
(define-values (p _complaints) (parse-package-source s))
|
||||
(and (git-source? p)
|
||||
;; Search from the location given up into parent directories
|
||||
;; until we reach the repo root.
|
||||
(let* ((root-p (struct-copy git-source p [path ""]))
|
||||
(root-u (string->url (parsed-package-source-human-tree-url root-p)))
|
||||
(here-u (string->url (parsed-package-source-human-tree-url p))))
|
||||
(and (member (url-scheme here-u) (list "http" "https"))
|
||||
(let loop ((here-u here-u))
|
||||
;; Strategy: Try to directly retrieve "README.md"
|
||||
;; first. In principle, we could/should try other
|
||||
;; github-supported READMEish names here, but if this
|
||||
;; first check fails we go for a horrible
|
||||
;; content-scraping strategy instead.
|
||||
(or (extant-readme-md-urlstring here-u)
|
||||
(horrible-readme-scraping-hack here-u)
|
||||
(and (not (equal? here-u root-u))
|
||||
(loop (struct-copy url here-u
|
||||
[path (drop-right (url-path here-u) 1)])))))))))
|
||||
|
||||
(define (rerender! items-to-rerender)
|
||||
(thread-send (package-change-handler-thread) (list 'rerender! items-to-rerender)))
|
||||
|
||||
|
|
Loading…
Reference in New Issue
Block a user