file/[un]tar: add support for [un]packing without using the filesystem

Closes #2549
This commit is contained in:
Matthew Flatt 2021-05-06 18:28:40 -06:00
parent 486ab09587
commit b04b0fe3e1
6 changed files with 349 additions and 93 deletions

View File

@ -15,7 +15,7 @@ Symbolic links (on Unix and Mac OS) are not followed by default.}
@defproc[(tar [tar-file path-string?]
[path path-string?] ...
[path-or-entry (or/c path-string? tar-entry?)] ...
[#:follow-links? follow-links? any/c #f]
[#:exists-ok? exists-ok? any/c #f]
[#:format format (or/c 'pax 'gnu 'ustar) 'pax]
@ -30,10 +30,15 @@ Symbolic links (on Unix and Mac OS) are not followed by default.}
exact-nonnegative-integer?]{
Creates @racket[tar-file], which holds the complete content of all
@racket[path]s. The given @racket[path]s are all expected to be
@racket[path-or-entry]s. Each @racket[path-or-entry] is either a path
that refers to a file, directory, or link on the filesystem, or it is
a @racket[tar-entry] that describes such an entity without requiring
it to exist on the filesystem.
The given paths among @racket[path-or-entry]s are all expected to be
relative paths for existing directories and files (i.e., relative
to the current directory). If a nested path is provided as a
@racket[path], its ancestor directories are also added to the
to the current directory for a @racket[path-or-entry] is a path). If a nested path is provided in a
@racket[path-or-entry], its ancestor directories are also added to the
resulting tar file, up to the current directory (using
@racket[pathlist-closure]). If @racket[follow-links?] is false, then
symbolic links are included in the resulting tar file as links.
@ -62,10 +67,11 @@ date to record in the archive for each file or directory.
#:changed "6.7.0.4" @elem{Added the @racket[#:format] argument and
effectively changed its default from @racket['ustar]
to @racket['pax].}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}]}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}
#:changed "8.1.0.5" @elem{Added support for @racket[tar-entry] arguments.}]}
@defproc[(tar->output [paths (listof path?)]
@defproc[(tar->output [paths-and-entries (listof (or/c path? tar-entry?))]
[out output-port? (current-output-port)]
[#:follow-links? follow-links? any/c #f]
[#:format format (or/c 'pax 'gnu 'ustar) 'pax]
@ -79,9 +85,9 @@ date to record in the archive for each file or directory.
file-or-directory-modify-seconds)])
exact-nonnegative-integer?]{
Like @racket[tar], but packages each of the given @racket[paths] in a @exec{tar} format
Like @racket[tar], but packages each element of the given @racket[paths-and-entries] in a @exec{tar} format
archive that is written directly to the @racket[out]. The specified
@racket[paths] are included as-is (except for adding @racket[path-prefix], if any); if a directory is specified, its
@racket[paths-and-entries] are included as-is (except for adding @racket[path-prefix], if any); if a directory is specified, its
content is not automatically added, and nested directories are added
without parent directories.
@ -91,11 +97,12 @@ without parent directories.
#:changed "6.7.0.4" @elem{Added the @racket[#:format] argument and
effectively changed its default from @racket['ustar]
to @racket['pax].}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}]}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}
#:changed "8.1.0.5" @elem{Added support for @racket[tar-entry] arguments.}]}
@defproc[(tar-gzip [tar-file path-string?]
[paths path-string?] ...
[paths-and-entries (and/c path-string? tar-entry?)] ...
[#:follow-links? follow-links? any/c #f]
[#:exists-ok? exists-ok? any/c #f]
[#:format format (or/c 'pax 'gnu 'ustar) 'pax]
@ -116,4 +123,53 @@ Like @racket[tar], but compresses the resulting file with @racket[gzip].
#:changed "6.7.0.4" @elem{Added the @racket[#:format] argument and
effectively changed its default from @racket['ustar]
to @racket['pax].}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}]}
#:changed "7.3.0.3" @elem{Added the @racket[#:timestamp] argument.}
#:changed "8.1.0.5" @elem{Added support for @racket[tar-entry] arguments.}]}
@defstruct[tar-entry ([kind (or/c 'file 'directory 'link)]
[path (and/c path-string? relative-path?)]
[content (or/c input-port? (-> input-port?) #f path-string?)]
[size exact-nonnegative-integer?]
[attribs (hash/c symbol? any/c)])]{
Represents a file, directory, or link to be included in a USTAR file
or stream.
If @racket[kind] is @racket['file], then @racket[content] must be an
input port or a thunk that produces an input port, and it must provide
exactly @racket[size] bytes. If @racket[kind] is @racket['directory],
then @racket[content] and @racket[size] are expected to be @racket[#f]
and @racket[0]. If @racket[kind] is @racket['link], then
@racket[content] must be a path, and @racket[size] is expected to be
@racket[0].
The @racket[attribs] field contains a hash table providing additional
properties of the entry. The following keys are currently used when
writing a USTAR file or stream:
@itemlist[
@item{@racket['permissions] --- an integer representing read,
write, and execute permissions in the form accepted by
@racket[file-or-directory-permissions].}
@item{@racket['modify-seconds] --- an integer representing a
modification time, which is consistent with
@racket[file-or-directory-modify-seconds].}
@item{@racket['owner] --- an exact integer presenting a file
owner ID.}
@item{@racket['owner-bytes] --- a byte string representing a
file owner name.}
@item{@racket['group] --- an exact integer presenting a file
group ID.}
@item{@racket['group-bytes] --- a byte string representing a
file group name.}
]
@history[#:added "8.1.0.5"]}

View File

@ -1,5 +1,6 @@
#lang scribble/doc
@(require "common.rkt" (for-label file/untar))
@(require "common.rkt" (for-label file/untar
(only-in file/tar tar-entry)))
@title[#:tag "untar"]{@exec{tar} File Extraction}
@ -17,7 +18,15 @@ pax extensions to support long pathnames.}
exact-nonnegative-integer?
exact-nonnegative-integer?
. -> . any/c)
(lambda args #t)])
(lambda args #t)]
[#:handle-entry handle-entry
((or/c 'file 'directory 'link)
(and path? relative-path?)
(or/c input-port? #f path?)
exact-nonnegative-integer?
(hash/c symbol? any/c)
. -> . (listof (-> any)))
handle-tar-entry])
void?]{
Extracts TAR/USTAR content from @racket[in], recognizing
@ -72,7 +81,58 @@ For each item in the archive, @racket[filter-proc] is applied to
If the result of @racket[filter-proc] is @racket[#f], then the item is
not unpacked.
The @racket[handle-entry] function is called to unpack one entry, and
the default @racket[handle-tar-entry] function for
@racket[handle-entry] creates a directory, file, or link on the
filesystem. The @racket[handle-entry] function must accept five
arguments:
@;
@itemlist[
@item{@racket[_kind] --- one of @racket['file], @racket['directory],
or @racket['link].}
@item{@racket[_path] --- the relative path recorded in the TAR file.}
@item{@racket[_content] --- an input port that provides the content
for a @racket['file] entry, where exactly @racket[_size] bytes
must be read from the port before @racket[handle-entry]
returns. For a @racket['directory] entry, @racket[_content] is
@racket[#f]. For a @racket['link] entry, @racket[_content] is
a path for the link target.}
@item{@racket[_size] --- the number of bytes for a @racket['file]
entry, and @racket[0] for other entries.}
@item{@racket[_attribs] --- an immutable hash table mapping symbols
to attribute values. The available keys may change, but the
currently included keys are the same ones as recognized in
@racket[tar-entry].}
]
The result of @racket[handle-entry] is a list of thunks that are
called in order after the TAR input is fully unpacked. A result thunk
from @racket[handle-entry] is useful, for example, to set a
directory's modification time after all files have been written to it.
@history[#:changed "6.3" @elem{Added the @racket[#:permissive?] argument.}
#:changed "6.7.0.4" @elem{Support long paths and long symbolic-link
targets using POSIX.1-2001/pax and GNU
extensions.}]}
extensions.}
#:changed "8.1.0.5" @elem{Added the @racket[#:handle-entry] argument.}]}
@defproc[(handle-tar-entry [kind (or/c 'file 'directory 'link)]
[path (and path? relative-path?)]
[content (or/c input-port? #f path?)]
[size exact-nonnegative-integer?]
[attribs (hash/c symbol? any/c)])
(listof (-> any))]{
As the default entry handler for @racket[untar],
@racket[handle-tar-entry] creates directories and files and returns a
list of thunks that complete unpacking by setting directory
permissions and modification times.
@history[#:added "8.1.0.5"]}

View File

@ -93,10 +93,14 @@
"four")
(unless (eq? 'windows (system-type))
(define (sanitize bstr)
(if (eq? 'macosx (system-type)) ; paths are always UTF-8 encodings
(string->bytes/utf-8 (bytes->string/utf-8 bstr #\?))
bstr))
(check "encoding"
(bytes->path #"one\xF0")
(bytes->path (sanitize #"one\xF0"))
"two\u3BB"
(bytes->path (bytes-append #"sub/three\xF1-" (make-bytes 93 (char->integer #\x))))
(bytes->path (bytes-append (sanitize #"sub/three\xF1-") (make-bytes 93 (char->integer #\x))))
(string-append "sub/four\u3BB-" (make-string 93 #\x)))
(check "long link"

View File

@ -1,5 +1,6 @@
#lang racket/base
(require file/untar file/untgz file/unzip racket/file racket/system racket/set
(except-in file/tar tar)
tests/eli-tester)
(provide tests)
@ -70,6 +71,24 @@
(begin (file-or-directory-permissions* dest "rwx") #t))))]
[else #t]))
(define (tar->entries a.tar)
(define got '())
(untar a.tar
#:handle-entry (lambda (kind path content len attribs)
(set! got
(cons (tar-entry kind
path
(if (input-port? content)
(let ([bstr (read-bytes len content)])
(lambda ()
(open-input-bytes bstr)))
content)
len
attribs)
got))
null))
(reverse got))
(define (untar-tests*)
(make-directory* "ex1")
(make-file (build-path "ex1" "f1") (- (current-seconds) 12) "rw")
@ -80,6 +99,35 @@
(make-file (build-path more-dir "f4") (current-seconds) "rw")
(file-or-directory-permissions* more-dir "rx") ; not "w"
(tar "ex1" a.tar)
(let ([got1 (tar->entries a.tar)])
(define (check-find path)
(unless (for/or ([e (in-list got1)])
(equal? path (tar-entry-path e)))
(error "missing in entries" path)))
(check-find (build-path "ex1" "f1"))
(check-find (build-path "ex1" "f2"))
(check-find (build-path "ex1" "f3"))
(check-find (build-path "ex1" "f4"))
(check-find (build-path "ex1" "more" "f4"))
(define-values (i o) (make-pipe))
(tar->output got1 o)
(close-output-port o)
(let ([got2 (tar->entries i)])
(unless (= (length got1) (length got2))
(error "entries lists not the same length"))
(define (check what same?)
(unless same?
(error "entries differ at" what)))
(for ([e1 (in-list got1)]
[e2 (in-list got2)])
(check 'kind (eq? (tar-entry-kind e1)
(tar-entry-kind e2)))
(check 'path (equal? (tar-entry-path e1)
(tar-entry-path e2)))
(check 'len (equal? (tar-entry-size e1)
(tar-entry-size e2)))
(check 'attribs (equal? (tar-entry-attribs e1)
(tar-entry-attribs e2))))))
(make-directory* "sub")
(parameterize ([current-directory "sub"]) (untar a.tar))
(test (diff "ex1" (build-path "sub" "ex1") #t))

View File

@ -46,10 +46,22 @@
(define 0-byte (char->integer #\0))
(define ((tar-one-entry buf prefix get-timestamp follow-links? format) path)
(let* ([link? (and (not follow-links?) (link-exists? path))]
[dir? (and (not link?) (directory-exists? path))]
[size (if (or dir? link?) 0 (file-size path))]
(provide (struct-out tar-entry))
(struct tar-entry (kind path content size attribs))
(define ((tar-one-entry buf prefix get-timestamp follow-links? format) path-or-entry)
(define entry (and (tar-entry? path-or-entry) path-or-entry))
(define path (if entry (tar-entry-path entry) path-or-entry))
(let* ([link? (if entry
(eq? 'link (tar-entry-kind entry))
(and (not follow-links?) (link-exists? path)))]
[dir? (if entry
(eq? 'directory (tar-entry-kind entry))
(and (not link?) (directory-exists? path)))]
[size (if (or dir? link?) 0 (if entry
(tar-entry-size entry)
(file-size path)))]
[attribs (and entry (tar-entry-attribs entry))]
[p 0] ; write pointer
[cksum 0]
[cksum-p #f])
@ -88,26 +100,37 @@
(unless (zero? (modulo len tar-block-size))
(write-bytes buf (current-output-port) (modulo len tar-block-size))))
(define attrib-path
(if link?
;; For a link, use attributes of the containing directory:
(let-values ([(base name dir?) (split-path path)])
(or (and (path? base)
base)
(current-directory)))
path))
(and (not entry)
(if link?
;; For a link, use attributes of the containing directory:
(let-values ([(base name dir?) (split-path path)])
(or (and (path? base)
base)
(current-directory)))
path)))
(define link-path-bytes (and link?
(path->bytes (resolve-path path))))
(if entry
(path->bytes (tar-entry-content entry))
(path->bytes (resolve-path path)))))
;; see http://www.mkssoftware.com/docs/man4/tar.4.asp for format spec
(define (write-a-block file-name-bytes file-prefix size type link-path-bytes)
(set! p 0)
(set! cksum 0)
(set! cksum-p #f)
(write-block tar-name-length file-name-bytes)
(write-octal 8 (path-attributes attrib-path dir?))
(write-octal 8 0) ; always root (uid)
(write-octal 8 0) ; always root (gid)
(write-octal 8 (if attribs
(hash-ref attribs 'permissions #o644)
(path-attributes attrib-path dir?)))
(write-octal 8 (if attribs
(hash-ref attribs 'owner 0)
0))
(write-octal 8 (if attribs
(hash-ref attribs 'group 0)
0))
(write-octal 12 size)
(write-octal 12 (get-timestamp attrib-path))
(write-octal 12 (if attribs
(hash-ref attribs 'modify-seconds 0)
(get-timestamp attrib-path)))
;; set checksum later, consider it "all blanks" for cksum
(set! cksum-p p) (set! cksum (+ cksum (* 8 32))) (advance 8)
(write-block* 1 type) ; type-flag
@ -118,8 +141,12 @@
(advance tar-link-name-length)) ; no link-name
(write-block 6 #"ustar") ; magic
(write-block* 2 #"00") ; version
(write-block 32 #"root") ; always root (user-name)
(write-block 32 #"root") ; always root (group-name)
(write-block 32 (if attribs
(hash-ref attribs 'owner-bytes #"root")
#"root"))
(write-block 32 (if attribs
(hash-ref attribs 'group-bytes #"root")
#"root"))
(write-octal 8 0) ; device-major
(write-octal 8 0) ; device-minor
(write-block tar-prefix-length file-prefix)
@ -175,23 +202,29 @@
(if (or dir? link?)
(zero-block! buf) ; must clean buffer for re-use
;; write the file
(with-input-from-file path
(lambda ()
(let loop ([n size])
(let ([l (read-bytes! buf)])
(cond
[(eq? l tar-block-size) (write-bytes buf) (loop (- n l))]
[(number? l) ; shouldn't happen
(write-bytes buf (current-output-port) 0 l) (loop (- n l))]
[(not (eq? eof l)) (error 'tar "internal error")]
[(not (zero? n))
(error 'tar "file changed while packing: ~e" path)]
[else (zero-block! buf) ; must clean buffer for re-use
(let ([l (modulo size tar-block-size)])
(unless (zero? l)
;; complete block (buf is now zeroed)
(write-bytes buf (current-output-port)
0 (- tar-block-size l))))]))))))))
(let ([copy
(lambda (in)
(let loop ([n size])
(let ([l (read-bytes! buf in)])
(cond
[(eq? l tar-block-size) (write-bytes buf) (loop (- n l))]
[(number? l) ; shouldn't happen
(write-bytes buf (current-output-port) 0 l) (loop (- n l))]
[(not (eq? eof l)) (error 'tar "internal error")]
[(not (zero? n))
(error 'tar "file changed while packing: ~e" path)]
[else (zero-block! buf) ; must clean buffer for re-use
(let ([l (modulo size tar-block-size)])
(unless (zero? l)
;; complete block (buf is now zeroed)
(write-bytes buf (current-output-port)
0 (- tar-block-size l))))]))))])
(if entry
(let ([content (tar-entry-content entry)])
(if (input-port? content)
(copy content)
(copy (content))))
(call-with-input-file* path copy))))))
;; tar-write : (listof relative-path) ->
;; writes a tar file to current-output-port

View File

@ -15,8 +15,18 @@
#:filter (path? (or/c path? #f)
symbol? exact-integer? (or/c path? #f)
exact-nonnegative-integer? exact-nonnegative-integer?
. -> . any/c))
void?)]))
. -> . any/c)
#:handle-entry ((or/c 'file 'directory 'link)
(and path? relative-path?)
(or/c input-port? #f path?) exact-nonnegative-integer?
(hash/c symbol? any/c)
. -> . (listof (procedure-arity-includes/c 0))))
void?)]
[handle-tar-entry ((or/c 'file 'directory 'link)
(and path? relative-path?)
(or/c input-port? #f path?) exact-nonnegative-integer?
(hash/c symbol? any/c)
. -> . (listof (procedure-arity-includes/c 0)))]))
(define-logger untar)
@ -24,7 +34,8 @@
#:dest [dest #f]
#:strip-count [strip-count 0]
#:permissive? [permissive? #f]
#:filter [filter void])
#:filter [filter void]
#:handle-entry [handle-entry handle-tar-entry])
((if (input-port? in)
(lambda (in f) (f in))
call-with-input-file*)
@ -35,7 +46,7 @@
(if (for/and ([b (in-bytes bstr)]) (zero? b))
(for ([delay (in-list (reverse delays))])
(delay))
(loop (untar-one-from-port in delays
(loop (untar-one-from-port in handle-entry delays
dest strip-count filter
permissive?
#f
@ -48,7 +59,14 @@
(error 'untar "unexpected EOF"))
s)
(define (untar-one-from-port in delays
(define (trim-terminator bstr)
(let loop ([i 0])
(cond
[(= i (bytes-length bstr)) bstr]
[(zero? (bytes-ref bstr i)) (subbytes bstr 0 i)]
[else (loop (add1 i))])))
(define (untar-one-from-port in handle-entry delays
dest strip-count filter
permissive?
path-from-extended-attributes
@ -77,8 +95,8 @@
[else 'unknown]))
(define link-target-bytes (read-bytes* 100 in))
(define ustar? (bytes=? #"ustar\00000" (read-bytes* 8 in)))
(define owner-bytes (read-bytes* 32 in))
(define group-bytes (read-bytes* 32 in))
(define owner-bytes (trim-terminator (read-bytes* 32 in)))
(define group-bytes (trim-terminator (read-bytes* 32 in)))
(define device-major-bytes (read-bytes* 8 in))
(define device-minor-bytes (read-bytes* 8 in))
(define filename-prefix-bytes (read-bytes* 155 in))
@ -106,52 +124,44 @@
(define create?
(filter base-filename filename type size link-target mod-time mode))
(define total-len (* (ceiling (/ size 512)) 512))
(define attribs (hasheq 'permissions mode
'modify-seconds mod-time
'owner owner
'group group
'owner-bytes owner-bytes
'group-bytes group-bytes))
(define (accum delays new-delays) (append (reverse new-delays) delays))
(cond
[(and filename create?)
(case type
[(dir)
(log-untar-info "directory: ~a" filename)
(make-directory* filename)
(cons
;; delay directory meta-data updates until after any contained
;; files are written
(lambda ()
(try-file-op
(lambda ()
(file-or-directory-permissions* filename mode #t)))
(try-file-op
(lambda ()
(file-or-directory-modify-seconds* filename mod-time #t))))
delays)]
(accum delays
(handle-entry 'directory
filename
#f 0
attribs))]
[(file)
(log-untar-info "file: ~a" filename)
(define-values (base name dir?) (split-path filename))
(when (path? base) (make-directory* base))
(call-with-output-file*
filename
#:exists 'truncate
(lambda (out)
(copy-bytes size in out)))
(try-file-op
(lambda ()
(file-or-directory-permissions* filename mode #f)))
(try-file-op
(lambda ()
(file-or-directory-modify-seconds* filename mod-time #f)))
(copy-bytes (- total-len size) in #f)
delays]
(begin0
(accum delays
(handle-entry 'file
filename
in size ; expect `size` bytes read from `in`
attribs))
(copy-bytes (- total-len size) in #f))]
[(link)
(log-untar-info "link: ~a" filename)
(define-values (base name dir?) (split-path filename))
(when (path? base) (make-directory* base))
(when (file-exists? filename) (delete-file filename))
(make-file-or-directory-link link-target filename)
delays]
(accum delays
(handle-entry 'link
filename
link-target 0
attribs))]
[(extended-header-for-next)
;; pax record to support long namesand other attributes
(define extended-header (read-pax in total-len))
;; Recur to use given paths, if any:
(untar-one-from-port in delays
(untar-one-from-port in handle-entry delays
dest strip-count filter
permissive?
(or (let ([v (hash-ref extended-header 'path #f)])
@ -165,7 +175,7 @@
(define o (open-output-bytes))
(copy-bytes total-len in o)
;; Recur to use given path:
(untar-one-from-port in delays
(untar-one-from-port in handle-entry delays
dest strip-count filter
permissive?
(bytes->path (nul-terminated (get-output-bytes o)))
@ -175,7 +185,7 @@
(define o (open-output-bytes))
(copy-bytes total-len in o)
;; Recur to use given link target:
(untar-one-from-port in delays
(untar-one-from-port in handle-entry delays
dest strip-count filter
permissive?
path-from-extended-attributes
@ -188,6 +198,51 @@
(copy-bytes total-len in #f)
delays]))
(define (handle-tar-entry kind
filename
content size
attribs)
(define mode (hash-ref attribs 'permissions))
(define mod-time (hash-ref attribs 'modify-seconds))
(case kind
[(directory)
(make-directory* filename)
(list
;; delay directory meta-data updates until after any contained
;; files are written
(lambda ()
(try-file-op
(lambda ()
(file-or-directory-modify-seconds* filename mod-time #t)))
(try-file-op
(lambda ()
(file-or-directory-permissions* filename mode #t)))))]
[(file)
(define in content)
(define-values (base name dir?) (split-path filename))
(when (path? base) (make-directory* base))
(call-with-output-file*
filename
#:exists 'truncate
(lambda (out)
(copy-bytes size in out)))
(try-file-op
(lambda ()
(file-or-directory-permissions* filename mode #f)))
(try-file-op
(lambda ()
(file-or-directory-modify-seconds* filename mod-time #f)))
null]
[(link)
(define link-target content)
(define-values (base name dir?) (split-path filename))
(when (path? base) (make-directory* base))
(when (file-exists? filename) (delete-file filename))
(make-file-or-directory-link link-target filename)
null]
[else
null]))
(define (copy-bytes amt in out)
(let ([bstr (make-bytes (min amt 4096))])
(let loop ([amt amt])