diff --git a/collects/meta/build/build b/collects/meta/build/build new file mode 100755 index 0000000000..447de3e589 --- /dev/null +++ b/collects/meta/build/build @@ -0,0 +1,2094 @@ +#!/bin/sh + +## This is the build script which creates the pre-compiled directory. It is +## currently running from Eli's account on winooski, but it should be easy to +## configure to run anywhere. It updates the svn trees, so instead of running +## it straight from there it is better to copy it someplace else before running +## so we get clean copies. + +# if we're not using it already, switch to bash +if [ "${BASH_VERSION:-not_bash}" = "not_bash" ]; then exec bash "$0" "$@"; fi + +############################################################################### +### Configuration + +# verbose output? +verbose="yes" +# should we record an external log at $scriptlogfile? ("only" means only there) +scriptlog="yes" +# should we make binaries? +make_bins="ask_or_yes" +# should we do a repository update (and start with an empty iplt dir)? +make_repos="ask_or_yes" +# should we make the pdf docs directory? +make_pdf_docs="ask_or_yes" +# should we re-make the build directories? +make_builddirs="ask_or_yes" +# should we make the pre-install bundles? +make_bundles="ask_or_yes" +# should we make platform-specific installers? +make_installers="ask_or_yes" +# should we make stuff available on the web page? +# (for major distributions, it will be in html/NNN instead of html/) +make_web="ask_or_yes" +# should we run all test options? (multiple configurations) +run_all_tests="no" + +# people to notify when a build starts +buildnotifyemail="" + +# repository paths to use -- trunk, tags/..., or branches/... +init_svnpath_vars() { + # use this function to initialize these on remote builds too + svnpath="${PLTSVNPATH:-trunk}" + svnipath="${PLTSVNIPATH:-trunk}" +} +init_svnpath_vars + +# main machine that runs the whole build (the expected `$hostname' value) +workmachine="winooski" +# main directory on $workmachine (should be absolute) +maindir="/home/scheme" + +# machines for specific installer creations +dmgmachine="kauai" +nsismachine="pitcairn" + +# list of environment variables that should be carried over to ssh jobs +ssh_vars=(PLTSVNPATH PLTSVNIPATH) + +# Add stuff to be msetted later (when we have the `mset' function) +declare -a initial_msets machines +msets() { + local m; for m; do initial_msets[${#initial_msets[*]}]="$m"; done +} +# shorthand for mset to define a build target +defbuild() { + machines[${#machines[*]}]="$1" + msets "/machines/$1" "platform=$2"; shift 2; msets "$@" +} + +# Remote builds configuration, a table of /machines// +# entries, with misc fields set. Machines and platforms must be unique. The +# "D" first entry is for default field values (missing default makes the field +# required). Warning: an `eval "foo=\"bar\""' is used to assign values. +msets "/machines/D" "workdir=/var/tmp" "moveto=" "copytobak=" \ + "configure_args=" "LDFLAGS=" "ext_lib_paths=" "renice=" +# defbuild "ccs-solaris" "sparc-solaris" "moveto=/proj/scheme" \ +# "ext_lib_paths=/arch/unix/packages/openssl-0.9.7e" +defbuild "pitcairn" "i386-win32" \ + "workdir=f:" # no "/..." 
path (that can get interpreted as a flag) +# The LDFLAGS is a workaround for a bug in Fink, see +# http://wiki.finkproject.org/index.php/Fink:Packaging:Preparing_for_10.5#OpenGL_Bug +defbuild "kauai" "ppc-darwin" "configure_args=--enable-xonx" \ + "LDFLAGS=-dylib_file /System/Library/Frameworks/OpenGL.framework/Versions/A/Libraries/libGL.dylib:/System/Library/Frameworks/OpenGL.framework/Versions/A/Libraries/libGL.dylib" +defbuild "weatherwax" "ppc-osx-mac" \ + "configure_args=--enable-sdk=/Developer/SDKs/MacOSX10.4u.sdk" +defbuild "macintel" "i386-osx-mac" \ + "configure_args=--enable-sdk=/Developer/SDKs/MacOSX10.4u.sdk" +# defbuild "galaga" "i386-linux-ubuntu-hardy" +defbuild "champlain" "i386-linux-f12" +defbuild "ccs-linux" "i386-linux-ubuntu-jaunty" "moveto=/proj/scheme" +# defbuild "punge" "i386-linux-ubuntu-jaunty" "renice=20" +# defbuild "bjorn" "i386-linux-gcc2" +# defbuild "chicago" "i386-linux-debian" +defbuild "brownbuild" "i386-linux-debian" # really an AMD64 machine +# defbuild "inga" "i386-freebsd" +# defbuild "chicago-unstable" "i386-linux-debian-unstable" +# Start the main build last +defbuild "$workmachine" "x86_64-linux-f7" "copytobak=$maindir" +msets "/" + +############################################################################### +### Initialize & Setup environment + +ulimit -c 100000000 +umask 002 # stuff that is created should be r/w by the group + +# get this script's name and path +cd "`dirname \"$0\"`" +buildscript="`pwd`/`basename \"$0\"`" +# get the current hostname (short version) +hostname="`hostname`" +hostname="${hostname%%.*}" + +# svn repository url +svnroot="http://svn.plt-scheme.org" + +# web directory for pre-prelease stuff on $workmachine (relative to $maindir) +prewebdir="html" +# directory for installation (relative to $maindir) +installdir="plt" +# directory for internal stuff (relative to $maindir) +scriptdir="iplt" +# directories for clean repository checkouts (relative to $maindir) +cleandir="checkout" +cleanscriptdir="icheckout" +# directory for binaries (relative to $maindir) +bindir="binaries" +# directory for pre-installers (relative to $maindir) +preinstdir="pre-installers" +# directory for installers (relative to $maindir) +installersdir="installers" +# directory for pre-installers (relative to $maindir) +instdir="installers" +# directory for docs (relative to $maindir) +docdir="docs" +# directory for web content (relative to $maindir) +webdir="web" +# script for patching files with current version info +versionpatcher="$maindir/$scriptdir/build/versionpatch" +# DrScheme test script +drtestscript="$maindir/$scriptdir/build/test-drscheme.ss" +# bundling script +bundlescript="$maindir/$scriptdir/build/bundle" +# web build script +webscript="$maindir/$scriptdir/web/build.ss" +# html patching script +htmlpatchscript="$maindir/$scriptdir/build/patch-html" +# sitemap materials +sitemapdir="$maindir/$scriptdir/build/sitemap" + +# platform-installer stuff, directories and files are all absolute +nsisdir="$maindir/$scriptdir/build/nsis" +unixinstallerdir="$maindir/$scriptdir/build/unix-installer" +unixpathcheckscript="$unixinstallerdir/check-install-paths" +unixinstallerscript="$unixinstallerdir/plt-installer-header" + +# full clean tgz before building anything (relative to $maindir) +cleantgz="${installdir}-clean-tree.tgz" +# full plt/src tgz (relative to $maindir) +srctgz="$installdir-src.tgz" +# log file for this script (relative to $maindir) +scriptlogfile="build-log.txt" +# name of html files to generate for web directories 
+index="index.html" +# timestamp and version file for automatic scripts (relative to $maindir) +stampfile="stamp" + +# directory for temporary stuff (absolute path) -- on all machines +tmpdir="/tmp" +# lockfile for this script +lockfile="/tmp/plt-build-lock" +# name for running this script remotely +remotebuildscript="$tmpdir/build-plt" +# full name for clean repository tgz file to transfer for distributed builds +repostgz="$tmpdir/$cleantgz" +# full name for full tgz file (with binaries etc) +fulltgz="$tmpdir/$installdir-full.tgz" +# log file name prefix for background jobs +bglogfile="$tmpdir/plt-bg-log" + +last_part() { + echo "$*" | sed 's/.*[ -]//' +} +last_part_capital() { + local word="`last_part \"$@\"`" + echo "`echo \"${word:0:1}\" | tr \"[:lower:]\" \"[:upper:]\"`${word:1}" +} + +# simple name associations +name_of_platform() { + case "$1" in + ( "i386-linux" ) echo "Linux/GCC3" ;; + ( "i386-linux-gcc2" ) echo "Linux/GCC2" ;; + ( "i386-linux-fc2" ) echo "Linux/Fedora Core 2" ;; + ( "i386-linux-fc5" ) echo "Linux/Fedora Core 5" ;; + ( "i386-linux-fc6" ) echo "Linux/Fedora Core 6" ;; + ( "i386-linux-f7" ) echo "Linux/Fedora 7/i386" ;; + ( "x86_64-linux-f7" ) echo "Linux/Fedora 7/x86_64" ;; + ( "i386-linux-f9" ) echo "Linux/Fedora 9/i386" ;; + ( "i386-linux-f12" ) echo "Linux/Fedora 12/i386" ;; + ( "i386-linux-debian" ) echo "Linux/Debian-stable" ;; + ( "i386-linux-debian-testing" ) echo "Linux/Debian-testing" ;; + ( "i386-linux-debian-unstable" ) echo "Linux/Debian-unstable" ;; + ( "i386-linux-ubuntu" ) echo "Linux/Ubuntu" ;; + ( "i386-linux-ubuntu-"* ) echo "Linux/Ubuntu `last_part_capital \"$1\"`" ;; + ( "i386-freebsd" ) echo "FreeBSD" ;; + ( "sparc-solaris" ) echo "Solaris" ;; + ( "i386-osx-mac" ) echo "Mac OS X (Intel)" ;; + ( "ppc-osx-mac" ) echo "Mac OS X (PPC)" ;; + ( "ppc-darwin" ) echo "Mac X11 on Darwin (PPC)" ;; + ( "i386-darwin" ) echo "Mac X11 on Darwin (Intel)" ;; + ( "i386-win32" ) echo "Windows" ;; + # These are source distribution platforms + ( "unix" ) echo "Unix" ;; + ( "mac" ) echo "Macintosh" ;; + ( "win" ) echo "Windows" ;; + ( * ) exit_error "Unknown platform name for name_of_platform \"$1\"" ;; + esac +} +extra_description_of_platform() { + local e="" + case "$1" in + ( "i386-linux" ) e="Binaries for GCC3 (eg, RedHat 9 and Fedora Core)." ;; + ( "i386-linux-gcc2" ) e="Binaries for old GCC2 setups (eg, RedHat 7.x)." ;; + ( "i386-linux-fc2" ) e="A Linux build on Fedora Core 2." ;; + ( "i386-linux-fc5" ) e="A Linux build on Fedora Core 5." ;; + ( "i386-linux-fc6" ) e="A Linux build on Fedora Core 6." ;; + ( "i386-linux-f7" ) e="A Linux build on Fedora 7 (i386)." ;; + ( "x86_64-linux-f7" ) e="A Linux build on Fedora 7 (x86_64)." ;; + ( "i386-linux-f9" ) e="A Linux build on Fedora 9 (i386)." ;; + ( "i386-linux-f12" ) e="A Linux build on Fedora 12 (i386)." ;; + ( "i386-linux-debian" ) e="A Linux build on Debian Stable." ;; + ( "i386-linux-debian-testing" ) e="A Linux build on Debian Testing." ;; + ( "i386-linux-debian-unstable" ) e="A Linux build on Debian Unstable." ;; + ( "i386-linux-ubuntu" ) e="A Linux build on Ubuntu." ;; + ( "i386-linux-ubuntu-"* ) + e="A Linux build on Ubuntu (`last_part_capital \"$1\"`)." ;; + ( *"-osx-mac" ) e="An OS X Build." ;; + ( *"-darwin" ) e="This is an X11 on Darwin build using"; + e="$e--enable-xonx, not a standard OS X build." ;; + esac + if [[ "$e" != "" ]]; then echo "
${e}"; fi +} +name_of_dist_package() { + case "$1" in + ( "mz" ) echo "MzScheme" ;; + ( "plt" ) echo "PLT Scheme" ;; + ( "full" ) echo "PLT Scheme Full" ;; + ( * ) exit_error "Unknown package name for name_of_dist_package: \"$1\"" ;; + esac +} +name_of_dist_type() { + case "$1" in + ( "bin" ) echo "Binary" ;; + ( "src" ) echo "Source" ;; + ( * ) exit_error "Unknown type name for name_of_dist_type: \"$1\"" ;; + esac +} +platforms_of_dist_type() { + case "$1" in + ( "bin" ) echo "i386-win32" \ + "i386-osx-mac" \ + "ppc-osx-mac" \ + "ppc-darwin" \ + "i386-darwin" \ + "i386-linux" \ + "i386-linux-gcc2" \ + "i386-linux-fc2" \ + "i386-linux-fc5" \ + "i386-linux-fc6" \ + "i386-linux-f7" \ + "x86_64-linux-f7" \ + "i386-linux-f9" \ + "i386-linux-f12" \ + "i386-linux-debian" \ + "i386-linux-debian-testing" \ + "i386-linux-debian-unstable" \ + "i386-linux-ubuntu" \ + "i386-linux-ubuntu-dapper" \ + "i386-linux-ubuntu-edgy" \ + "i386-linux-ubuntu-feisty" \ + "i386-linux-ubuntu-hardy" \ + "i386-linux-ubuntu-intrepid" \ + "i386-linux-ubuntu-jaunty" \ + "i386-freebsd" \ + "sparc-solaris" ;; + ( "src" ) echo "win mac unix" ;; + ( * ) exit_error "Unknown type name for platforms_of_dist_type: \"$1\"" ;; + esac +} +installer_of_dist_type_platform() { # input: dtype-dplatform + case "$1" in + ( "src-unix" ) echo "tgz" ;; + ( "src-mac" ) echo "dmg" ;; + ( "src-win" ) echo "zip" ;; + ( "bin-"*"-linux"* ) echo "sh" ;; + ( "bin-"*"-freebsd" ) echo "sh" ;; + ( "bin-"*"-solaris" ) echo "sh" ;; + ( "bin-"*"-darwin" ) echo "sh" ;; + ( "bin-"*"-osx-mac" ) echo "idmg" ;; + ( "bin-"*"-win32" ) echo "exe" ;; + ( * ) exit_error "Unknown dist type+platform for" \ + "installer_of_dist_type_platform: \"$1\"" ;; + esac +} +explanation_of_installer_type() { + case "$1" in + ( "tgz" ) echo "Unpack this file using" \ + "\"gunzip | tar xvf -\"." ;; + ( "dmg" ) echo "Mount this disk image and copy the PLT folder to your" \ + "disk." ;; + ( "idmg" ) echo "Some browsers will automatically mount & copy the" \ + "\"PLT Scheme\" folder to your desktop; if yours" \ + "does not, mount the disk and copy it yourself." ;; + ( "zip" ) echo "Use unzip to extract the PLT folder to your disk." ;; + ( "sh" ) echo "Execute this file with \"sh \"," \ + "and follow the instructions." ;; + ( "exe" ) echo "This is a standard Windows installer." ;; + ( * ) exit_error "Unknown installer type for" \ + "explanation_of_installer_type: \"$1\"." 
;; + esac +} + +# This is for running mzscheme scripts, unrelated to the build itself +export PLTHOME="$maindir/$installdir" \ + PLT_EXTENSION_LIB_PATHS="" \ + PLTPLANETDIR="/tmp/plt-build-planet" +export PATH="$PLTHOME/bin:$PATH" +unset PLTCOLLECTS; export PLTCOLLECTS + +# useful for tests etc +export PLT_BUILD="yes" + +# setup for gui tests (and outside of them, there will not be a :65 +# display, so trying any gui will fail) +real_DISPLAY="$DISPLAY" +export DISPLAY=":65" +if [[ "$XAUTHORITY" = "" ]]; then export XAUTHORITY="$HOME/.Xauthority"; fi + +############################################################################### +### Utilities + +no_exit_on_error="no" +exit_error() { + echo "" + echo "<<>> (Working on ${machine}(${platform}))" 1>&2 + echo "$@" 1>&2 + if [[ "$no_exit_on_error" = "yes" ]]; then + echo "" + else + echo "Aborting" 1>&2 + exit 1 + fi +} +dont_exit() { + no_exit_on_error="yes" ; "$@" ; no_exit_on_error="no" +} + +cleanup_lockfile() { + rm -f "$lockfile" +} + +# Utilities for multi-level variables that can be used as sort of an +# associative arrays, with names that are treated similarly to paths and a +# default context similar to the current directory. (Implemented as plain +# variables, using "__" as the translation of "/" level separators.) +shopt -s extglob # needed for some hacks below +mcontext="/" # the current context for m-ops +mset() { + # mset goes over all args, which can have the following shapes: + # ...=... sets a variable in the current context + # /.../... sets the current absolute context + # .../... sets the current relative context + local m mvar val + for m; do + case "$m" in + ( *=* ) mvar="${m%%=*}" val="${m#*=}" + normalize_mvar; obfuscate_mvar + eval "${mvar}=\"${val}\"" + ;; + ( */* ) mvar="$m"; normalize_mvar; mcontext="$mvar" ;; + ( * ) exit_error "unknown name in mset: $m" ;; + esac + done +} +mget() { + # mget crawls over all args, and for each one retreives the mvar into a plain + # variable. The full form of an arg is "tgt=mvar?def" for a specified target + # var (default is the mvar's basename), and a default. The default can start + # with `@' to make it another mvar reference + local m mvar tgt def nodef=" <<>> " + for m; do + mvar=""; tgt=""; def="$nodef" + if [[ "$m" = *=* ]]; then tgt="${m%%=*}"; m="${m#*=}"; fi + if [[ "$m" = *"?"* ]]; then def="${m#*[?]}"; m="${m%%[?]*}"; fi + mvar="$m"; normalize_mvar + if [[ "$tgt" = "" ]]; then tgt="${mvar##*/}"; fi + obfuscate_mvar + if [[ "$def" = "$nodef" ]]; then + eval "${tgt}=\"\${${mvar}?${m} is not set}\"" + else + local R="$nodef" + eval "R=\"\${${mvar}:-\"$R\"}\"" + if [[ "$R" != "$nodef" ]]; then eval "${tgt}=\"${R}\"" + elif [[ "$def" = "@"* ]]; then mget "${tgt}=${def#@}" + else eval "${tgt}=\"${def}\"" + fi + fi + done +} +machineget() { + # an mget-like version for machines, using the default fields (and a global + # $machine value) + local m tgt + for m; do + if [[ "$m" = *=* ]]; then tgt="${m%%=*}="; m="${m#*=}"; else tgt=""; fi + mget "${tgt}/machines/${machine}/${m}?@/machines/D/${m}" + done +} +# Utility for the above: normalize `mvar' (mvar and mcontext are globals) +normalize_mvar() { + # absolute mvar => don't use the mcontext + if [[ ! "$mvar" = "/"* ]]; then mvar="/${mcontext}/${mvar}"; fi + mvar="${mvar}/" # add "/" suffix for the processing below + mvar="${mvar//\/+(\/)//}" # "//" -> "/" + mvar="${mvar//\/.\///}" # "/./" -> "/" + mvar="${mvar//\/+([^\/])\/..\///}" # eliminate ".." + mvar="${mvar/#\/+(..\/)//}" # eliminate prefix ".." 
+ mvar="${mvar%/}" # remove "/" suffix +} +obfuscate_mvar() { + mvar="${mvar//\//__}" + mvar="${mvar//-/_}" +} +# now that we have these functions, do the initial_msets +mset "${initial_msets[@]}" +# global build-context variables, and set main-machine values +machine="$workmachine" +machineget platform workdir + +# portable `echo -n' +if [[ "`echo -n`" = "-n" ]]; then + echo_n() { echo ${1+"$@"}"\c"; } +else + echo_n() { echo -n ${1+"$@"}; } +fi + +show() { + if [[ "$verbose" = "yes" ]]; then + echo "" + case "$platform" in + ( *"-linux"* | "sparc-solaris" | "i386-win32" ) + echo ">>>" "$@" | fmt -t -w 79 + ;; + ( *"-freebsd" | *"-osx-mac" | *"-darwin" ) + echo ">>>" "$@" | fmt -w 79 + ;; + ( * ) + echo ">>>" "$@" | fmt + ;; + esac + fi +} + +# a yes/no question mode for some vars, possibly set a constant answer +ask_mode="no" +fixed_reply="" +is_yes() { + local var="$1"; shift + local val; eval val="\$$var" + local reply + if [[ "$val" = "yes" ]]; then return 0 + elif [[ "$val" = "no" ]]; then return 1 + elif [[ "$val" = "ask_or_yes" ]]; then + if [[ "$ask_mode" = "yes" ]]; then + echo "" 1>&2 + echo "" 1>&2 + while true; do + echo_n ">>> QUESTION >>> $var [y/n/Y/N] ? " 1>&2 + if [[ "$fixed_reply" != "" ]]; then reply="$fixed_reply" + else read -sn 1 reply; fi + echo "$reply" 1>&2 + case "$reply" in + ( Y ) fixed_reply="y"; reply="y" ;; + ( N ) fixed_reply="n"; reply="n" ;; + esac + case "$reply" in + ( y ) eval $var="yes"; return 0 ;; + ( n ) eval $var="no"; return 1 ;; + ( * ) reply="" ;; + esac + done + else + eval $var="yes"; return 0 + fi + else + exit_error "bad value for flag '$var': '$val'" + fi +} + +lookfor() { + save_IFS="${IFS}" + IFS="${IFS}:" + for dir in $PATH; do + if test -x "$dir/$1"; then + IFS="$save_IFS" + echo_n "$dir/$1" + return + fi + done + IFS="$save_IFS" +} + +_run() { + show "Running \"$*\"" + "$@" \ + || exit_error "Errors when running \"$*\"" +} + +# there is a common sh hack for getting the Nth word from a command: +# "... `set \`blah\`; echo $1` ..." +# the problem with this is if blah produces no output -- which will end up +# dumping out the complete environment -- so use this instead +__get_first_arg() { printf '%s' "$1"; } +__get_first_output() { __get_first_arg `cat`; } +# inputs: command to run +get_first() { "$@" | __get_first_output; } + +_cd() { + local OLDWD="`pwd`" + cd "$1" || exit_error "Could not cd into \"$1\"" + local NEWWD="`pwd`" + if [[ "$NEWWD" != "$OLDWD" ]]; then + show "Now in \"`pwd`\"" + fi +} + +_md() { + for x; do + if [[ ! 
-d "$x" ]]; then + show "Creating directory \"$x\"" + mkdir -p "$1" || exit_error "Could create directory \"$x\"" + fi + done +} + +_mcd() { + _md "$1"; _cd "$1" +} + +_rm() { + for x; do + if [[ -h "$x" ]]; then + show "Deleting link \"$x\"" + rm -f "$x" || exit_error "The \"$x\" link cannot be deleted" + elif [[ -d "$x" ]]; then + show "Deleting directory \"$x\"" + rm -rf "$x" || exit_error "The \"$x\" directory cannot be deleted" + elif [[ -e "$x" ]]; then + show "Deleting \"$x\"" + rm -rf "$x" || exit_error "\"$x\" cannot be deleted" + fi + done +} + +_rmd() { + _rm "$1"; _md "$1" +} + +_rmcd() { + _rm "$1"; _mcd "$1" +} + +_mv() { + show "Moving \"$*\"" + mv "$@" || exit_error "Could not move \"$*\"" +} + +_cat() { + show "Showing \"$@\"" + cat "$@" || exit_error "Could not show \"$@\"" +} + +_cp() { + show "Copying: \"$*\"" + cp -p "$@" || exit_error "Could not copy \"$*\"" +} + +_scp() { + show "Copying: \"$*\"" + scp -p "$@" || exit_error "Could not copy \"$*\"" +} + +_ln() { + show "SymLinking \"$2\" -> \"$1\"" + ln -s "$1" "$2" || exit_error "Could not symlink \"$2\"->\"$1\"" +} + +_zip() { + local zip_file="$1"; shift + show "Zipping \"$*\" to \"$zip_file\" in \"`pwd`\"" + zip -qr9 "$zip_file" "$@" \ + || exit_error "Could not zip \"$*\" to \"$zip_file\" in \"`pwd`\"" +} + +# try to use gtar if we can find it +TAR="`lookfor gtar`" +if [[ "$TAR" = "" ]]; then TAR="`lookfor tar`"; fi + +_tar() { + local tar_file="$1"; shift + show "Tarring \"$*\" to \"$tar_file\" in \"`pwd`\"" + "$TAR" cf "$tar_file" "$@" \ + || exit_error "Could not tar \"$*\" to \"$tar_file\" in \"`pwd`\"" +} + +_tgzip() { + local tgz_file="$1"; shift + show "Packing \"$*\" to \"$tgz_file\" in \"`pwd`\"" + "$TAR" czf "$tgz_file" "$@" \ + || exit_error "Could not pack \"$*\" to \"$tgz_file\" in \"`pwd`\"" +} + +_tar_add() { + local tar_file="$1"; shift + show "Adding \"$*\" to \"$tar_file\" in \"`pwd`\"" + "$TAR" uf "$tar_file" "$@" \ + || exit_error "Could not add \"$*\" to \"$tar_file\" in \"`pwd`\"" +} + +_tgunzip() { + show "Unpacking \"$1\" in \"`pwd`\"" + "$TAR" xzf "$1" || exit_error "Could not unpack \"$1\" in \"`pwd`\"" +} + +_tgunzipm() { + show "Unpacking \"$1\" in \"`pwd`\"" + "$TAR" xzmf "$1" || exit_error "Could not unpack \"$1\" in \"`pwd`\"" +} + +_strip() { + local f + for f; do + if [[ -e "$f" ]]; then + show "Stripping \"$f\"" + strip -S "$f" || exit_error "Could not strip \"$f\"" + fi + done +} + +svn_get() { # inputs: svn repository, svn path, path in $maindir + local repos="$1" path="$2" dir="$3"; shift 3 + show "Getting $repos/$path to $maindir/$dir" + _cd "$maindir" + if [[ ! -d "$dir" ]]; then + _run svn checkout --depth immediates "$svnroot/$repos" "$dir" + fi + _cd "$dir" + _run svn update --set-depth infinity "$path" + svn status "$path" > "$tmpdir/svn-st" \ + || exit_error "problems running svn status" + if [[ -s "$tmpdir/svn-st" ]]; then + cat "$tmpdir/svn-st" 1>&2 + rm -f "$tmpdir/svn-st" + exit_error "The working directory is not clean (see above)" + fi + rm -f "$tmpdir/svn-st" + _cd "$maindir" +} + +append_dots() { # inputs: width, string + local line="............................................................" + echo "${2}${line:0:$(( ${1} - ${#2} ))}" +} + +separator() { + local line="============================================================" + local sep="$*" + local sep_len=${#sep} + local idx1=$(( ( 77 - $sep_len ) / 2 )) + local idx2=$(( ( 78 - $sep_len ) / 2 )) + local line1=${line:0:$(( ( $idx1 < 3 ) ? 3 : $idx1 ))} + local line2=${line:0:$(( ( $idx2 < 3 ) ? 
3 : $idx2 ))} + local dashes="`echo \"$line1 $sep $line2\" | sed 's/./-/g'`" + echo "" + echo "" + echo "$dashes" + echo "$line1 $sep $line2" + echo "$dashes" + echo "" +} + +build_step() { # inputs: name, command + local jobname="$1"; shift + separator "Building: $jobname [${machine}(${platform})]" + show "Running \"$*\"" + start_timer + "$@" || exit_error "\"$jobname\" part of build process failed" + show_time "--==> $jobname on ${machine}(${platform}) done," +} + +cur_secs() { + date '+%s' +} +start_timer() { + timer_start=`cur_secs` +} +show_time() { + local time=$(( `cur_secs` - $timer_start )) + local secs=$(( $time % 60 )) + local mins=$(( $time / 60 )) + show "$1 time: `printf '%d:%02d' $mins $secs`" +} + +choose_for_testing() { # input: test_mode, options ... + # choose items from the given inputs, either all, the first, or a random one + local mode="$1"; shift + case "$mode" in + ( all ) echo "$*" ;; + ( def ) echo "$1" ;; + ( rnd ) mode=$(( $RANDOM % $# + 1 )); echo "${!mode}" ;; + ( * ) exit_error "bad value in choose_for_testing: $mode" + esac +} + +# Utilities for GUI tests (and process management) + +_kill() { # args: pid [process name] + local pid="$1"; shift + local desc="$pid" + if [[ "$1" != "" ]]; then desc="$1 ($pid)"; shift; fi + if [[ ! -d "/proc/$pid" ]]; then return; fi + show "Killing $desc" + kill -15 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + usleep 500000 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + usleep 500000 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 1 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "re-killing $desc" + kill -15 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 2 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "re-re-killing $desc" + kill -15 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 2 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "murdering $desc" + kill -9 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 2 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "re-murdering $desc" + kill -9 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 2 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "re-re-murdering $desc" + kill -9 "$pid" > /dev/null 2>& 1; if [[ ! -d "/proc/$pid" ]]; then return; fi + sleep 2 ; if [[ ! -d "/proc/$pid" ]]; then return; fi + echo "BOOM Zombie alert: $desc did not die" +} + +_timeout_run() { # first input is the timeout + local timeout="$1"; shift + local exe="$1" + show "Running \"$*\" with a timeout of $timeout" + "$@" & + local pid="$!" + local result="99" + ( # sleep in background so we're still interruptible + local sleeper="$$" + alldone() { kill -15 "$sleeper"; exit; } + trap alldone 0 3 9 15 + sleep "$timeout" & + sleeper="$!" + wait "$sleeper" + _kill "$pid" "$exe [timeout]" + ) & + local killerpid="$!" + wait "$pid"; result="$?" + _kill "$killerpid" + if [[ "$result" != "0" ]]; then + exit_error "Errors when running \"$*\" (with a timeout)" + fi + return "$result" +} + +Xvncpid="" +Xwmpid="" +_start_xvnc() { + local xvnclog="$tmpdir/plt-xvnc-log" + show "Starting Xvnc (logfile at \"$xvnclog\")" + # Create Xauth cookie + cookie="`mcookie`" + xauth -f "$XAUTHORITY" add "`uname -n`$DISPLAY" . "$cookie" + xauth -f "$XAUTHORITY" add "`uname -n`/unix$DISPLAY" . 
"$cookie" + # Create Xvnc session, with a WM + Xvnc "$DISPLAY" \ + -rfbport 6565 \ + -localhost \ + -desktop "PLT-Session" \ + -geometry 1024x768 \ + -depth 16 \ + -httpPort=0 \ + -auth "$XAUTHORITY" \ + -rfbauth "$HOME/.vnc/passwd" \ + -br \ + > "$xvnclog" 2>&1 & + Xvncpid="$!"; usleep 500000 + echo "Xvnc running ($Xvncpid)" + metacity --sm-disable & + Xwmpid="$!"; usleep 500000 + echo "window manager running ($Xwmpid)" + # to see the window, uncomment this + # DISPLAY="$real_DISPLAY" vncviewer ::6565 -PasswordFile "$HOME/.vnc/passwd" & +} +_end_xvnc() { + show "Killing Xvnc session" + if [[ "$Xvncpid" = "" ]]; then show "Xvnc was not started"; return 1; fi + _kill "$Xwmpid" "window manager" + _kill "$Xvncpid" "Xvnc" + Xvncpid=""; Xwmpid="" +} + +parse_c_define() { # input: filename, varname + local file="$1" varname="$2"; shift 2 + grep "^ *# *define * $varname * " "$file" \ + | sed -e 's/^ *# *define * [^ ]* * //' -e 's/ * $//' +} + +version_init() { # input: plthome + local vfile="$1/src/mzscheme/src/schvers.h" + [[ -e "$vfile" ]] \ + || exit_error "Could not find version file at \"$vfile\"" + # parse version info + version="`parse_c_define \"$vfile\" MZSCHEME_VERSION | sed -e 's/\"//g'`" + version1="`parse_c_define \"$vfile\" MZSCHEME_VERSION_X`" + version2="`parse_c_define \"$vfile\" MZSCHEME_VERSION_Y`" + version3="`parse_c_define \"$vfile\" MZSCHEME_VERSION_Z`" + version4="`parse_c_define \"$vfile\" MZSCHEME_VERSION_W`" + # consistency check + local VER="$version1.$version2" + if [[ "$version4" != "0" ]]; then VER="$VER.$version3.$version4" + elif [[ "$version3" != "0" ]]; then VER="$VER.$version3" + fi + [[ "$version" = "$VER" ]] \ + || exit_error "Mismatch in \"$vfile\": $version vs $VER" + # release is when the last one is zero + if [[ "$version4" = "0" ]]; then + separator "This is a release version ($version)" + releasing="yes" + reallyreleasing="yes" + elif [[ "$svnpath" = "release" ]]; then + separator "This is a pre-release version ($version)" + releasing="yes" + reallyreleasing="no" + else + separator "This is a non-release version ($version)" + releasing="no" + reallyreleasing="no" + fi +} + +# html functions -- all write to $htmloutput +# ($htmloutput is usually $index; also, assume that $htmloutput is in the +# current directory -- be careful when cd-ing!) +# stuff before html_content_begin and after html_content_end is temporary, +# later on, patch-html will combine the contents with the skeleton files. +html_begin() { # inputs: title [output-name] + local htmltitle="$1"; shift + htmloutput="$index" + if [[ "$1" != "" ]]; then htmloutput="$1"; shift; fi + show "Creating \"`pwd`/$htmloutput\" for \"$htmltitle\"" + _rm "$htmloutput" + { echo "" + echo "$htmltitle" + echo "" + echo "" + echo " " + echo " " + echo "
" + echo " " + echo " " + echo "
" + echo " $htmltitle

" + while [[ "$#" -gt "0" ]]; do + if [[ "$1" = "-f" ]]; then shift; cat "$1"; else echo "$1"; fi + shift + done + } > "$htmloutput" +} +html_content_begin() { + echo '' >> "$htmloutput" +} +html_table_begin() { # inputs: [rules-attr] + local rules="rows" + if [[ "$1" != "" ]]; then rows="$1"; fi + { echo "
" + echo "" + } >> "$htmloutput" +} +html_show() { # inputs: or <-f file> ... + { while [[ "$#" -gt "0" ]]; do + if [[ "$1" = "-f" ]]; then shift; cat "$1"; else echo "$1"; fi + shift + done + } >> "$htmloutput" +} +html_file_row() { # inputs: filename, explanation ... + local fname="$1"; shift + { echo_n "" + echo "" + } >> "$htmloutput" +} +html_table_end() { + echo "
• " + echo_n "$fname" + if [[ -f "$fname" ]]; then + echo_n " (`get_first du -h \"$fname\"`)" + fi + echo " $*
" >> "$htmloutput" +} +html_content_end() { + echo '' >> "$htmloutput" +} +html_end() { + { echo "
" + echo '' + echo "(version $version, $htmltimestamp)" + echo '' + echo "
" + echo "" + } >> "$htmloutput" + show "Finished \"`pwd`/$htmloutput\"" +} + +run_part() { + local exec=no + local bg=no + while true; do + case "$1" in + ( -exec ) exec="yes"; shift; continue ;; + ( -bg ) bg="yes"; shift; continue ;; + ( * ) break ;; + esac + done + local runhost="$1" runpart="$2"; shift 2 + # echo "runhost=$runhost, exec=$exec, bg=$bg, $*" + if [[ "$runhost" = "$hostname" ]]; then + if [[ "$bg" = "yes" ]]; then "$runpart" "$@" & + # must check $exec before running -- since if this is done in bg, then + # other calls to this function will overwrite it! + elif [[ "$exec" = "yes" ]]; then "$runpart" "$@"; exit $? + else "$runpart" "$@" + fi + else + # ssh does not preserve proper arguments, so this does not work with + # arguments that contain spaces. + local rbuild="$remotebuildscript" + _scp "$buildscript" "${runhost}:$remotebuildscript" + local ssh_vars_vals i var val + i=0 + while [[ "$i" -lt ${#ssh_vars[*]} ]]; do + var="${ssh_vars[i]}" + eval "val=\"\${$var}\"" + ssh_vars_vals[$i]="${var}=${val}" + i=$((i+1)) + done + local cmd + cmd="--dispatch ${ssh_vars_vals[@]} $runhost $runpart" + if [[ "$exec" = "yes" ]]; then + exec ssh "$runhost" "$rbuild" $cmd "$@" \ + || exit_error "Errors running \"$rbuild\" on \"$runhost\"" + exit_error "Something is wrong with \"exec\"" + elif [[ "$bg" = "yes" ]]; then + ssh "$runhost" "$rbuild" $cmd "$@" & + else + ssh "$runhost" "$rbuild" $cmd "$@" \ + || exit_error "Errors running \"$rbuild\" on \"$runhost\"" + fi + fi +} + + +############################################################################### +### Build Parts + +MAIN() { + # switch to build machine, if invoked remotely + run_part -exec "$workmachine" MAIN_BUILD "$@" +} + +## ============================================================================ + +MAIN_BUILD() { + + ## -------------------------------------------------------------------------- + # use a lock file, no retries, and recreate it if it's over 3 hours old + _run lockfile -r 0 -l 10800 "$lockfile" + trap cleanup_lockfile 0 3 9 15 + + ## -------------------------------------------------------------------------- + separator "Begin (`date`)" + + timestamp="`date '+%Y%m%d%H%M'`" + htmltimestamp="`date '+updated at %A, %B %d %Y, %H:%M %Z'`" + if [[ "$1" = "ask" ]]; then ask_mode="yes"; shift; fi + + ## -------------------------------------------------------------------------- + if is_yes make_repos; then + separator "Repository updates" + svn_get "plt" "$svnpath" "$cleandir" + svn_get "iplt" "$svnipath" "$cleanscriptdir" + else + show "Skipping repository updates" + fi + + version_init "$maindir/$cleandir/$svnpath" + if is_yes make_repos; then + DO_AUTO_UPDATES + fi + + if is_yes make_repos; then + _cd "$maindir" + _rm "$scriptdir" + _cp -r "$cleanscriptdir/$svnipath" "$scriptdir" + fi + + if is_yes make_bins; then + _rm "$repostgz" + _cd "$maindir/$cleandir/$svnpath" + _tgzip "$repostgz" --exclude=".svn" * + _cd "$maindir" + fi + + # send build notification message + if [[ "$buildnotifyemail" != "" && "$CRON" != "yes" ]]; then + show "Sending notifications" + echo "Build starting at `date`" \ + | mail -s "A build is starting..." 
"$buildnotifyemail" + fi + + ## -------------------------------------------------------------------------- + separator "Dispatching build jobs" + + local m + if is_yes make_bins; then + for m in "${machines[@]}"; do DO_COPY_BUILD "$m"; done + else + show "Skipping binaries" + fi + + # build pdfs while other machines continue doing their builds + BUILD_DOCS_AND_PDFS + + # and now wait for all builds + if is_yes make_bins; then + show "Waiting for remote jobs to finish" + wait + for m in "${machines[@]}"; do + machine="$m" + machineget mplatform=platform + if [[ "$machine" != "$workmachine" ]]; then + separator "{{{ Doing ${machine}(${mplatform}) remotely }}}" + _cat "$bglogfile-$machine" + _rm "$bglogfile-$machine" + fi + done + fi + + if is_yes make_builddirs; then COPY_AND_BUILD_BINARY_DIRS + else show "Skipping copying and dirs"; fi + + if is_yes make_bundles; then BUILD_BUNDLES + else show "Skipping bundles"; fi + + if is_yes make_installers; then BUILD_INSTALLERS + else show "Skipping installers"; fi + + if is_yes make_web; then BUILD_WEB; fi + + _rm "$lockfile" + + separator "Done (`date`)" + +} + +## ============================================================================ + +DO_AUTO_UPDATES() { + + ## -------------------------------------------------------------------------- + separator "Updating repository files" + + show "Updating stamp file" + _cd "$maindir/$cleandir/$svnpath" + local stamp="collects/repos-time-stamp/stamp.ss" + _rm "$stamp" + show "Creating $stamp" + { echo_n '#lang scheme/base (provide stamp) (define stamp "' + echo_n "`date +'%e%b%Y' | tr -d ' ' | tr 'A-Z' 'a-z'`" + echo '")' + } > "$stamp" + + show "Updating version numbers" + # if the racket executable is not there, we'll fail, but that + # shouldn't be a problem since it will run again next time + if [[ -x "$PLTHOME/bin/racket" ]]; then + dont_exit _run "$versionpatcher" "$version" + else + show "Skipping version update (no racket executable)" + fi + + show "Committing changes (if any)" + _run svn commit -m "Welcome to a new PLT day." . 
+ +} + +## ============================================================================ + +DO_COPY_BUILD() { # inputs -- machine-name (for ssh) + + ## -------------------------------------------------------------------------- + machine="$1"; shift + + if [[ "$machine" != "$workmachine" ]]; then + show "Running DO_BUILD on $machine in the background" + _scp "$repostgz" "${machine}:$repostgz" + _rm "$bglogfile-$machine" + run_part -bg "$machine" "DO_BUILD" "$releasing" "$@" \ + &> "$bglogfile-$machine" + else + separator "{{{ Doing ${machine}(${platform}) locally }}}" + run_part "$machine" "DO_BUILD" "$releasing" "$@" + fi + +} + +## ============================================================================ + +DO_BUILD() { # inputs -- releasing + + ## -------------------------------------------------------------------------- + releasing="$1"; shift + machineget platform workdir moveto copytobak \ + configure_args ext_lib_paths renice + + if [[ "$renice" != "" ]]; then dont_exit _run renice "$renice" "$$"; fi + + export PLTHOME="$workdir/$installdir" PATH="$PLTHOME/bin:$PATH" + export SETUP_ARGS="-l- setup -U" + + # make sure we don't use any planet caches (PLTPLANETDIR is set globally) + _rm "$PLTPLANETDIR" + + if [[ "$releasing" = "yes" ]]; then + # don't do this for the nightly builds -- if they fail and a previous tgz + # is there, we'll end up using it + _rm "$fulltgz" + fi + + if [[ ext_lib_paths != "" ]]; then + export PLT_EXTENSION_LIB_PATHS="${ext_lib_paths}:$PLT_EXTENSION_LIB_PATHS" + fi + + if [[ "$machine" != "$workmachine" ]]; then + _rmcd "$PLTHOME" + _tgunzipm "$repostgz" + export SETUP_ARGS="$SETUP_ARGS -D" + else + # on the main machine, copy the repository to keep meta .svn information + _cd "$workdir" + _rm "$PLTHOME" + _cp -r "$maindir/$cleandir/$svnpath" "$PLTHOME" + # and then create pre-build archives + show "Creating pre-build archives" + _tgzip "$maindir/$cleantgz" --exclude=".svn" "$installdir" + _tgzip "$maindir/$srctgz" --exclude=".svn" "$installdir/src" + fi + + ## -------------------------------------------------------------------------- + if [[ "$platform" = "i386-win32" ]]; then + export PLTPLANETDIR="`cygpath -w \"$PLTPLANETDIR\"`" + DO_WIN32_BUILD + else + _mcd "$PLTHOME/src/build" + machineget LDFLAGS; export LDFLAGS + build_step "configure" ../configure ${configure_args} + build_step "make both" make both + build_step "make install" make plain-install-both + build_step "setup-plt" "$PLTHOME/bin/racket" $SETUP_ARGS + fi + + ## -------------------------------------------------------------------------- + separator "${machine}(${platform}): Stripping binaries" + + # Strip binaries + _cd "$PLTHOME" + case "$platform" in + ( *"-linux"* | *"-freebsd" | "sparc-solaris" | *"-darwin" ) + _strip "bin/racket"{,3m,cgc} "bin/gracket"{,3m,cgc} + ;; + ( *"-osx-mac" ) + _strip "bin/racket"{,3m,cgc} "GRacket"*".app/Contents/MacOS/GRacket"* \ + "lib"/{,G}"Racket.framework"/"Versions"/*/{,G}"Racket" + ;; + ( *"-win32" ) + # (just don't include *.pdb and *ilk) + show "Nothing to strip for \"$platform\"" + ;; + ( * ) + exit_error "don't know if binaries for $platform should be stripped" + ;; + esac + + ## -------------------------------------------------------------------------- + separator "${machine}(${platform}): Creating \"$fulltgz\"" + + _rm "$fulltgz" + _cd "$workdir" + # excluding x/y does not work on solaris, so rename it instead + _mv "$PLTHOME/src" "$PLTHOME/___src___" + _tgzip "$fulltgz" --exclude="___src___" --exclude=".svn" \ + --exclude="*.[Pp][Dd][Bb]" 
--exclude="*.[Ii][Ll][Kk]" \ + "$installdir" + _mv "$PLTHOME/___src___" "$PLTHOME/src" + + ## -------------------------------------------------------------------------- + # choose a test mode (def/rnd/all) + local test_mode="def" + if [[ "$run_all_tests" = "yes" ]]; then test_mode="all"; + elif [[ "$releasing" = "yes" ]]; then test_mode="all"; + elif [[ "$(( $RANDOM % 2 ))" = "0" ]]; then test_mode="rnd"; + fi; + separator "${machine}(${platform}) testing Racket ($test_mode)" + local testdir="$tmpdir/mztests" + _rmcd "$testdir" + + local _exe _jit exe flags + for _exe in `choose_for_testing $test_mode 3m cgc`; do + for _jit in `choose_for_testing $test_mode yes no`; do + if [[ "${_exe}" = "cgc" ]]; then exe="cgc"; else exe=""; fi + if [[ "$platform" = "i386-win32" ]]; then + exe="$PLTHOME/Racket$exe.exe" + else + exe="$PLTHOME/bin/racket$exe" + fi + flags="" + if [[ "${_jit}" = "no" ]]; then flags="--no-jit $flags"; fi + dont_exit _run env HOME="$testdir" DISPLAY="" \ + "$exe" $flags "$PLTHOME/collects/tests/run-automated-tests.ss" + done + done + sleep 8 # time to flush stderr + + # MrEd-based tests on the main machine, in an Xvnc session + if [[ "$machine" = "$workmachine" ]]; then + separator "${machine}(${platform}) running Mred/DrScheme tests" + _start_xvnc + dont_exit _timeout_run 60 env HOME="$testdir" "$drtestscript" + dont_exit _timeout_run 300 env HOME="$testdir" \ + "$PLTHOME/collects/tests/framework/framework-test" + _end_xvnc + fi + + ## -------------------------------------------------------------------------- + # move to the target at the end of the build, only if building from trunk + local targetdir="" mode="" op="" + if [[ "$svnpath" != "trunk" ]]; then : + elif [[ "$copytobak" != "" ]]; then + targetdir="$copytobak"; mode="bk"; op="Backing up" + elif [[ "$moveto" != "" ]]; then + targetdir="$moveto"; mode="mv"; op="Moving" + fi + if [[ "$targetdir" != "" ]]; then + separator "${machine}(${platform}): $op installation to \"$targetdir\"" + _md "$targetdir/$installdir-new" + _cd "$workdir/$installdir" + show "Copying \"$PLTHOME\" to \"$targetdir/$installdir-new\"" + "$TAR" cf - . | ( cd "$targetdir/$installdir-new"; "$TAR" xf - ) \ + || exit_error \ + "Could not copy \"$PLTHOME\" to \"$targetdir/$installdir-new\"" + _cd "$targetdir" + if [[ "$mode" = "mv" ]]; then + # move the installation, trying to delete the previous one if possible + # do it this way in case there is already a leftover "$installdir-old" + _md "$installdir-old" + _mv "$installdir" "$installdir-old/old-`date '+%Y%m%d%H%M'`-$$" + _mv "$installdir-new" "$installdir" + _rm "$PLTHOME" + show "Removing \"$targetdir/$installdir-old\"" + # this is done this way in case there is an old process using a binary + # which will not allow removing the directory, but we don't care about + # that. + dont_exit _rm "$targetdir/$installdir-old" + else + # copy the installation to a backup directory, leaving one + # backup of the old backup tree if it was there (this is used on + # the build machine, so there's an updated copy of the tree at + # ~scheme/plt); the main work directory is kept the same. 
+ if [[ -e "$installdir-backup" ]]; then _rm "$installdir-backup"; fi + if [[ -e "$installdir" ]]; then _mv "$installdir" "$installdir-backup"; fi + _mv "$installdir-new" "$installdir" + fi + fi + + ## -------------------------------------------------------------------------- + separator "${machine}(${platform}) done" + +} + +## ============================================================================ + +winpath2unix() { # input: windows path + echo "$*" | sed 's_^\([a-zA-Z]\):[/\\]_/\1/_; s_\\_/_g' +} + +build_w32step() { # inputs: type, name, [args...] + separator "Building: $2 [${machine}(${platform})] ($1)" + local btype="$1" bname="$2"; shift 2 + start_timer + case "$btype" in + ( "VSNET" ) _cd "$PLTHOME/src/worksp/$bname" + _run "$VSNET" "$bname.sln" /build "Release|Win32" + ;; + ( "NMAKE" ) _run "$NMAKE" "$@" + ;; + ( "MZCGC" ) _run "$PLTHOME/RacketCGC.exe" "$@" + ;; + ( "MZ" ) # prefer using no-suffix, then 3m, and then cgc + # (needed because cgc is used to build 3m) + local E="$PLTHOME/Racket" + if [[ -x "${E}.exe" ]]; then _run "${E}.exe" "$@" + elif [[ -x "${E}3m.exe" ]]; then _run "${E}3m.exe" "$@" + elif [[ -x "${E}CGC.exe" ]]; then _run "${E}CGC.exe" "$@" + else exit_error "No Racket executable found" + fi + ;; + ( "VSNET3M" ) _cd "$PLTHOME/src/worksp/$bname" + _run "$VSNET" "$bname.sln" /build "Release|Win32" + _run "$PLTHOME/Racket.exe" "xform.ss" "$@" + _run "$VSNET" "$bname.sln" /build "3m|Win32" + ;; + ( * ) exit_error "Unknown type for build_w32step: \"$btype\"" ;; + esac + show_time "--==> $bname on ${machine}(${platform}) done," +} + +DO_WIN32_BUILD() { + + ## -------------------------------------------------------------------------- + /usr/bin/mount -c / + + export TEMP="c:\\cygwin\\tmp" TMP="c:\\cygwin\\tmp" + + # Note: commands must be executed using unix paths (also PATH) + STUDIO="c:\\Program Files\\Microsoft Visual Studio 8" + SCOMMON="$STUDIO\\Common7" + VC="$STUDIO\\VC" + VSNET="`winpath2unix \"$SCOMMON\\IDE\\devenv.com\"`" + NMAKE="`winpath2unix \"$VC\\bin\\nmake.exe\"`" + local uSCOMMON="`winpath2unix \"$SCOMMON\"`" + local uVC="`winpath2unix \"$VC\"`" + local uPLTHOME="`winpath2unix \"$PLTHOME\"`" + PATH="$uVC/bin:$uSCOMMON/IDE:$uSCOMMON/Tools:$uSCOMMON/Tools/Bin" + PATH="$PATH:/usr/local/bin:/usr/bin:/bin" + PATH="$PATH:/c/Windows/system32:/c/Windows:/c/Windows/System32/Wbem" + PATH="$PATH:$uPLTHOME:$uPLTHOME/bin" + PATH="$PATH:." 
+ + INCLUDE="$VC\\include;$VC\\atlmfc\\include;$VC\PlatformSDK\Include" + LIB=".;$VC\\lib;$VC\\atlmfc\\lib;$VC\\PlatformSDK\\lib" + export VSNET NMAKE PATH INCLUDE LIB + + # separator "win32: Convert .sln files" + # local SAVED_IFS="$IFS"; IFS=$'\n' + # local sln + # for sln in `find "$PLTHOME/src/worksp" -type f -name "*.sln"`; do + # _cd "`dirname \"$sln\"`" + # _run "$VSNET" /upgrade "`basename \"$sln\"`" + # done + # IFS="$SAVED_IFS" + + separator "win32: Full build" + build_w32step VSNET "mzscheme" + build_w32step VSNET "mred" + _cd "$PLTHOME/src/worksp/gc2"; build_w32step MZ "3M" make.ss + + _cd "$PLTHOME" + build_w32step VSNET "mzstart" + build_w32step VSNET "mrstart" + + separator "win32: Building libraries" + _cd "$PLTHOME"; build_w32step MZ "mzc" -l- setup -Dl compiler + + build_w32step VSNET3M "mzcom" + build_w32step VSNET3M "libmysterx" + # _cd "$PLTHOME/src/srpersist" + # build_w32step NMAKE "srpersist" /f srpersist.mak "install" + + _cd "$PLTHOME"; build_w32step MZ "setup-plt" $SETUP_ARGS + + separator "win32: Building Cygwin libreries" + _mcd "$PLTHOME/src/build" + _run ../configure --disable-mred + _cd "mzscheme/dynsrc" + show "Running \"make\" for Cygwin" + make && make cygwin-install \ + || exit_error "Errors when running \"make\" for Cygwin" + + # Borland is no longer supported: + # separator "win32: Building Borland libreries" + # _cd "$PLTHOME/src/mzscheme/dynsrc" + # _run bcc32 -I"../include" -I"g:/borland/bcc55/include" \ + # -o"mzdynb.obj" -c "mzdyn.c" + # _md "$PLTHOME/lib/bcc" + # _cp "mzdynb.obj" "mzdynb.def" "$PLTHOME/lib/bcc" + + _cd "$PLTHOME" + build_w32step MZ "winvers" -l setup/winvers; sleep 240 + +} + +## ============================================================================ + +BUILD_DOCS_AND_PDFS() { + + separator "Copying and making \"$docdir\"" + + _rmcd "$maindir/$docdir" + html_begin "Documentation" + html_content_begin + html_table_begin + { + html_file_row "html" \ + "html files for on-line browsing (same as plt/collecs/doc)" + _rm "html" + _cp -r "$workdir/$installdir/doc" "html" + } + if is_yes make_pdf_docs; then + html_file_row "pdf" "pdf versions of the manuals" + _rmcd "pdf" + # avoid any work except for the pdf generation + _run "$PLTHOME/bin/setup-plt" \ + --no-zo --no-launcher --no-install --no-post-install \ + --no-info-domain --no-docs --no-user --no-planet \ + --doc-pdf . + _cd .. + else + show "Skipping pdf build" + fi + html_table_end + html_content_end + html_end + +} + +## ============================================================================ + +COPY_AND_BUILD_BINARY_DIRS() { + + ## -------------------------------------------------------------------------- + # This creates build-related directories. The installers and + # pre-installers are built in their own steps. + + ## -------------------------------------------------------------------------- + separator "Copying and making \"$bindir\"" + + _rmcd "$maindir/$bindir" + + html_begin "Binaries" + html_content_begin + html_show "Note that the binaries include the CGC versions." 
+ html_table_begin + + local m + for m in "${machines[@]}"; do + machine="$m" machineget mplatform=platform + mplatformname="`name_of_platform \"$mplatform\"`" + html_file_row "$mplatform" "Binaries for $mplatformname" + { + _rmcd "$mplatform" + local btgz="$installdir-$mplatform-binaries.tgz" + local ftgz="$installdir-$mplatform-full.tgz" + local prfx="" + if [[ "$m" != "$workmachine" ]]; then prfx="${m}:"; fi + _scp "${prfx}$fulltgz" "$ftgz" + local extratext="`extra_description_of_platform \"$mplatform\"`" + html_begin "$mplatformname binaries ($mplatform)" + html_content_begin + html_show "These are the $mplatformname binary files." $extratext + html_table_begin + # The following two things do not exist until the bundle script runs + html_file_row "$installdir" "The binary files part of the build tree" + html_file_row "$btgz" "An archive of the above" + html_file_row "$ftgz" "An archive of the fully-built tree" \ + "
(without the \"src\" tree)" + html_table_end + html_content_end + html_end + _cd .. + } + done + + html_table_end + html_content_end + html_end + + ## -------------------------------------------------------------------------- + separator "Making \"$stampfile\"" + + _cd "$maindir" + _rm "$stampfile" + echo "$timestamp $version" > "$stampfile" + +} + +## ============================================================================ + +BUILD_BUNDLES() { + + ## -------------------------------------------------------------------------- + # the index in this directory is made by BUILD_INSTALLERS below + + separator "Creating pre-installer bundles" + _rmd "$maindir/$preinstdir" + show "Running the bundle script" + local bundleflags="" + if [[ "$releasing" = "yes" ]]; then bundleflags="$bundleflags ++release"; fi + _run "$PLTHOME/bin/racket" \ + "$bundlescript" -o "$maindir/$preinstdir" $bundleflags + +} + +## ============================================================================ + +# platform-specific installer makers: +# $1 is input file, $2 is the output (without suffix) +# $3 is the package name (mz/plt), $4 is the type (bin/src) +# $5 is the platform name (unix/mac/win for src distributions) + +#---------------------------------------- +tgz_to_tgz() { + if [[ "$4" != "src" ]]; then + _cp "$1" "$2.tgz" + else + local savedpwd="`pwd`" + local srcdir="$3-$version" + _rmcd "$tmpdir/tgz-to-tgz-$$" + _tgunzip "$1" + _mv "$installdir" "$srcdir" + _tgzip "$2.tgz" "$srcdir" + _cd "$savedpwd" + _rm "$tmpdir/tgz-to-tgz-$$" + fi +} +#---------------------------------------- +tgz_to_sh() { + local srctgz="$1" tgtsh="$2.sh" pname="$3"; shift 3 + local tmppackdir="$tmpdir/pack-$$" + local tmptgz="$tmpdir/pack-$$.tar.gz" + local treesize installerlines archivecksum + # check paths data in configure script + if [[ "$unixpathcheckscript" != "DONE" ]]; then + show "Checking paths in configure script" + _run "$unixpathcheckscript" + unixpathcheckscript="DONE" + fi + savedpwd="`pwd`" + _rmcd "$tmppackdir" + _tgunzip "$srctgz" + _run sudo chown -R root:root "$tmppackdir" + _run sudo chmod -R g+w "$tmppackdir" + _cd "$installdir" + _run pax -w -z -f "$tmptgz" * + treesize="`get_first du -hs .`" + _cd "$savedpwd" + # change back so we can remove it + _run sudo chown -R "`id -nu`:`id -ng`" "$tmppackdir" + _rm "$tmppackdir" + archivecksum="`get_first cksum \"$tmptgz\"`" + local humanname="`name_of_dist_package \"$pname\"` v$version" + local tgtname="$pname" + if [[ "$releasing" != "yes" ]]; then tgtname="$tgtname-$version"; fi + echo "Writing \"$tgtsh\"" + { echo "#!/bin/sh" + echo "" + echo "# This is a self-extracting shell script for $humanname." + echo "# To use it, just run it, or run \"sh\" with it as an argument." 
+ echo "" + echo "DISTNAME=\"$humanname\"" + echo "PNAME=\"$pname\"" + echo "TARGET=\"$tgtname\"" + echo "BINSUM=\"$archivecksum\"" + echo "ORIGSIZE=\"$treesize\"" + echo "RELEASED=\"$releasing\"" + } > "$tgtsh" \ + || exit_error "Could not write \"$tgtsh\"" + installerlines=$(( `get_first wc -l "$unixinstallerscript"` + + `get_first wc -l "$tgtsh"` + + 2 )) + echo "BINSTARTLINE=\"$installerlines\"" >> "$tgtsh" + cat "$unixinstallerscript" >> "$tgtsh" + cat "$tmptgz" >> "$tgtsh" + chmod +x "$tgtsh" + rm "$tmptgz" +} +#---------------------------------------- +tgz_to_zip() { + local savedpwd="`pwd`" + local srcdir="$installdir" + _rmcd "$tmpdir/tgz-to-zip-$$" + _tgunzip "$1" + if [[ "$4" = "src" ]]; then + srcdir="$3-$version" + _mv "$installdir" "$srcdir" + fi + _zip "$2.zip" "$srcdir" + _cd "$savedpwd" + _rm "$tmpdir/tgz-to-zip-$$" +} +#---------------------------------------- +make_dmg() { # inputs: dir, dmg, internet-enabled? + local srcdir="$1" tgtdmg="$2" internet_enabled="$3"; shift 3 + local tmpdmg="${tgtdmg%.dmg}-tmp.dmg" + local src="`basename \"$srcdir\"`" + local myself="`id -nu`:`id -ng`" + show "Making \"$tgtdmg\" from \"$srcdir\"" + _cd "`dirname \"$srcdir\"`" + _run sudo rm -f "$tgtdmg" "$tmpdmg" + # It should be possible to create dmgs normally, but they'd be created with + # the same user id of whoever runs this script... + _run sudo chown -R root:admin "$src" + # The following command should work fine, but it looks like hdiutil in 10.4 + # is miscalculating the needed size, making it too big in our case (and too + # small with >8GB images). It seems that it works to first generate an + # uncompressed image and then convert it to a compressed one. + # _run sudo hdiutil create -format UDZO -imagekey zlib-level=9 -ov \ + # -mode 555 -volname "$src" -srcfolder "$src" "$tgtdmg" + # so: [1] create an uncompressed image + _run sudo hdiutil create -format UDRW -ov \ + -mode 555 -volname "$src" -srcfolder "$src" "$tmpdmg" + # [2] remove the source tree + _run sudo rm -rf "$src" + # [3] create the compressed image from the uncompressed image + _run sudo hdiutil convert -format UDZO -imagekey zlib-level=9 -ov \ + "$tmpdmg" -o "$tgtdmg" + # [4] remove the uncompressed image + _run sudo chown "$myself" "$tgtdmg" "$tmpdmg" + _rm "$tmpdmg" + # this will make browsers get the dmg, mount, copy contents, unmount + if [[ "$internet_enabled" = "yes" ]]; then + _run hdiutil internet-enable "$tgtdmg" + fi +} +#---------------------------------------- +do_tgz_to_dmg() { + local internet_enabled="$1" tmptgz="$2" tmpdmg="$3" version="$4" + local packagename="$5" packagetype="$6" + shift 6 + local distname="`name_of_dist_package \"$packagename\"`" + distname="$distname v$version" + if [[ "$packagetype" != "bin" ]]; then + distname="$distname `name_of_dist_type \"$packagetype\"`" + fi + local savedpwd="`pwd`" + _rm "$tmpdmg" + _rmcd "$tmpdir/tgz-to-dmg-$$" + _mcd "$distname" + _tgunzip "$tmptgz" + _rm "$tmptgz" + _mv "$installdir" "$distname" + _cd "$tmpdir/tgz-to-dmg-$$" + make_dmg "$distname" "$tmpdmg" "$internet_enabled" + _cd "$savedpwd" + _rm "$tmpdir/tgz-to-dmg-$$" +} +tgz_to_some_dmg() { + local internet_enabled="$1" srctgz="$2" tgtdmg="$3.dmg"; shift 3 + local tmptgz="$tmpdir/tgz2dmg.tgz" + local tmpdmg="$tmpdir/tgz2dmg.dmg" + _scp "$srctgz" "${dmgmachine}:$tmptgz" + run_part "$dmgmachine" "do_tgz_to_dmg" "$internet_enabled" \ + "$tmptgz" "$tmpdmg" "$version" "$@" + _scp "${dmgmachine}:$tmpdmg" "$tgtdmg" +} +tgz_to_dmg() { + tgz_to_some_dmg "no" "$@" +} +tgz_to_idmg() { # same as ..._dmg, 
but makes it internet-enabled + tgz_to_some_dmg "yes" "$@" +} +#---------------------------------------- +do_tgz_to_exe() { + local tmptgz="$1" tmpexe="$2" nsistgz="$3" packagename="$4" packagetype="$5" + shift 5 + local savedpwd="`pwd`" + _rmcd "$tmpdir/tgz-to-exe-$$" + _tgunzip "$nsistgz" + _tgunzip "$tmptgz" + show "Running NSIS to create the installer" + "/c/Program Files/NSIS/makensis.exe" /V3 "plt-installer.nsi" | tr -d '\r' \ + || exit_error "NSIS build failed" + _mv "installer.exe" "$tmpexe" + _cd "$savedpwd" + _rm "$tmpdir/tgz-to-exe-$$" +} +tgz_to_exe() { + local srctgz="$1" tgtexe="$2.exe" pname="$3"; shift 3 + local nsistgz="$tmpdir/plt-nsis.tgz" + local tmptgz="$tmpdir/tgz2exe.tgz" + local tmpexe="$tmpdir/tgz2exe.exe" + _rm "$tmpdir/plt-nsis-$$" + _cp -r "$nsisdir" "$tmpdir/plt-nsis-$$" + _cd "$tmpdir/plt-nsis-$$" + show "Writing \"plt-defs.nsh\"" + { local def='!define' + echo "$def PLTVersion \"$version\"" + # this must be four numbers + echo "$def PLTVersionLong \"$version1.$version2.$version3.$version4\"" + echo "$def PLTHumanName \"`name_of_dist_package \"$pname\"` v$version\"" + if [[ "$releasing" != "yes" ]]; then + echo "$def PLTStartName \"`name_of_dist_package \"$pname\"` v$version\"" + else + echo "$def PLTStartName \"`name_of_dist_package \"$pname\"`\"" + fi + local dname + case "$pname" in + ( "plt" ) dname="PLT" ;; + ( "mz" ) dname="MzScheme" ;; + ( "full" ) dname="PLT-FULL" ;; + ( * ) exit_error "Unknown package name for exe installer: \"$pname\"" ;; + esac + if [[ "$releasing" != "yes" ]]; then + echo "$def PLTDirName \"$dname-$version\"" + else + echo "$def PLTDirName \"$dname\"" + fi + echo "$def PLTRegName \"$dname-$version\"" + if [[ "$pname" = "mz" ]]; then echo "$def SimpleInstaller"; fi + } > "plt-defs.nsh" \ + || exit_error "Could not write \"plt-defs.h\"" + local line="---------- plt-defs.nsh ----------" + echo "$line" + cat "plt-defs.nsh" + echo "$line" | sed 's/./-/g' + _tgzip "$nsistgz" * + _cd "$tmpdir" + _rm "$tmpdir/plt-nsis-$$" + _scp "$nsistgz" "${nsismachine}:$nsistgz" + _scp "$srctgz" "${nsismachine}:$tmptgz" + run_part "$nsismachine" \ + "do_tgz_to_exe" "$tmptgz" "$tmpexe" "$nsistgz" "$pname" "$@" + _scp "${nsismachine}:$tmpexe" "$tgtexe" +} +#---------------------------------------- + +do_installers_page_body() { # input: selector-html table-html + local selector="$1" table="$2"; shift 2 + local dtype dtypename dists dist distname platforms ptype ptypename + local d file fsize idx expl + local dists="plt mz full" + local dtypes="bin src" + { echo "" + echo "
" + echo "
" + echo "Distribution:   " + echo "Platform+Type:   " + echo "" + echo "
" + echo "" + echo "
" + echo "" + echo "" + } >> "$selector" + local NAcell="N/A" + local SRCcell="$cleantgz" + idx=0 + for dtype in $dtypes; do + dtypename="`name_of_dist_type \"$dtype\"`" || exit_error "Bad dist type" + echo "" >> "$table" + echo "$dtypename distributions" >> "$table" + for d in $dists; do + echo "`name_of_dist_package \"$d\"`" >> "$table" + done + echo "" >> "$table" + for ptype in `platforms_of_dist_type "$dtype"`; do + if [[ "$dtype" != "bin" || -d "$maindir/$bindir/$ptype" ]]; then + ptypename="`name_of_platform \"$ptype\"`" \ + || exit_error "Bad dist package" + echo "" >> "$table" + echo "$ptypename" >> "$table" + for d in $dists; do + if [[ "$dtype" = "src" ]]; then case "$d" in + ( "plt" | "mz" ) ;; + ( "full" ) echo "$SRCcell" >> "$table"; continue ;; + ( * ) echo "$NAcell" >> "$table"; continue ;; + esac; fi + distributions2[idx++]="$d-$dtype-$ptype" + file="`ls \"$d-$version-$dtype-$ptype.\"*`" + if [[ "$file" = "" ]]; then + echo "(missing)" >> "$table" + else + local fsize="`get_first du -h \"$file\"`" + echo "$file" >> "$table" + echo "($fsize)" >> "$table" + fi + done + echo "" >> "$table" + fi + done + done +} + +BUILD_INSTALLERS() { + + ## -------------------------------------------------------------------------- + separator "Creating platform-specific installers" + _rmd "$maindir/$instdir" + _cd "$maindir/$preinstdir" + html_begin "Pre-installers" + html_content_begin + html_table_begin + local tgz idx + idx=0 + for tgz in *.tgz; do + local dname="`basename \"$tgz\" .tgz`" + distributions1[idx++]="$dname" + local dpackage="` echo \"$dname\" | cut -d - -f 1`" + local dtype="` echo \"$dname\" | cut -d - -f 2`" + local dplatform="`echo \"$dname\" | cut -d - -f 3-`" + html_file_row "$tgz" "`name_of_dist_type \"$dtype\"` distribution of" \ + "`name_of_dist_package \"$dpackage\"` for" \ + "`name_of_platform \"$dplatform\"`" + convert="tgz_to_`installer_of_dist_type_platform \"$dtype-$dplatform\"`" + separator "Making \"$dpackage-$dtype\" installer for \"$dplatform\"" + show "Using \"$convert\" to convert \"$dname\"" + "$convert" "$maindir/$preinstdir/$tgz" \ + "$maindir/$instdir/$dpackage-$version-$dtype-$dplatform" \ + "$dpackage" "$dtype" "$dplatform" + _cd "$maindir/$preinstdir" + done + html_table_end + html_content_end + html_end + + _cd "$maindir/$instdir" + show "Making the distributions page" + _rm "$tmpdir/plt-tmp-selector" "$tmpdir/plt-tmp-table" + do_installers_page_body "$tmpdir/plt-tmp-selector" "$tmpdir/plt-tmp-table" + # selector page + html_begin "Installers" + html_content_begin + html_show -f "$tmpdir/plt-tmp-selector" + html_content_end + html_end + # static table page + html_begin "Installers (static)" "table.html" + html_content_begin + html_table_begin "all" + html_show -f "$tmpdir/plt-tmp-table" + _rm "$tmpdir/plt-tmp-selector" "$tmpdir/plt-tmp-table" + html_table_end + html_content_end + html_end + + local f sorted1 sorted2 + show "Checking generated pre-distribution and distributions on index page" + sorted1="`for f in \"${distributions1[@]}\"; do echo \"$f\"; done | sort`" + sorted2="`for f in \"${distributions2[@]}\"; do echo \"$f\"; done | sort`" + if [[ "$sorted1" = "$sorted2" ]]; then + show "File lists identical, good." + else + show "File lists do not match." 
+ show "Generated pre-distributions:" + echo "$sorted1" + show "Indexed distributions:" + echo "$sorted2" + exit_error "Fix this script" + fi + +} + +## ============================================================================ + +move_from_maindir() { # input: file-name + if [[ -e "$maindir/$1" ]]; then + if [[ -e "$1" ]]; then _rmd "TEMP_WEB"; _mv "$1" "TEMP_WEB"; fi + _mv "$maindir/$1" . + _rm "TEMP_WEB" + elif [[ ! -e "$1" ]]; then exit_error "\"$1\" is not in $maindir or `pwd`" + else show "Skipping \"$1\"" + fi +} +copy_from() { # input: directory file-name + _rmcd "TEMP_WEB" + show "Copying: \"$1/$2\" to \"`pwd`\"" + ( cd "$1" ; tar cf - --exclude=".svn" "$2" ) | tar xf - \ + || exit_error "Could not copy \"$1/$2\" to \"`pwd`\"" + _cd ".." + if [[ -e "$2" ]]; then _mv "$2" "TEMP_WEB/TEMP_WEB"; fi + _mv "TEMP_WEB/$2" . + _rm "TEMP_WEB" +} + +BUILD_WEB() { + + local w="$prewebdir" + # cases for "hidden" results: building a major version, or a non-default path + if [[ "$reallyreleasing" = "yes" ]]; then w="$w/$version" + elif [[ "$svnpath" != "trunk" ]]; then w="$w/$svnpath" + elif [[ "$releasing" = "yes" ]]; then w="$w/$version" + fi + + ## -------------------------------------------------------------------------- + if [[ "$w" = "$prewebdir" ]]; then separator "Making external web pages" + else separator "Making external web pages at $w" + fi + + _mcd "$maindir/$w" + + html_begin "PLT Nightly Builds" + html_content_begin + html_table_begin + #---- + move_from_maindir "$installersdir" + html_file_row "$installersdir" "Installers" \ + "
(these are platform-specific distribution files, similar to" \ + "standard distributions.)" + #---- + move_from_maindir "$docdir" + html_file_row "$docdir" "Documentation files" + #---- + html_file_row "search.html" "Search the current sources and docs" + #---- + move_from_maindir "$bindir" + html_file_row "$bindir" "Platform-specific binary files" + #---- + move_from_maindir "$preinstdir" + html_file_row "$preinstdir" "Pre-installer files" \ + "
(these contain distribution files in tgz format, which are" \ + "used to create platform-specific installers)" + #---- + copy_from "$workdir" "$installdir" + html_file_row "$installdir" "A complete build tree (built on $platform)" + #---- + move_from_maindir "$cleantgz" + html_file_row "$cleantgz" \ + "The complete repository source tree packed in a gzipped tarball" + #---- + move_from_maindir "$srctgz" + html_file_row "$srctgz" "An archive containing only plt/src" \ + "
(can be used right after unpacking other tgz files to add" \ + "the src contents to an existing plt tree)" + #---- + html_file_row "script.html" "Sample scripts for using nightly builds" \ + "
(read this if you want to use automatic scripts to keep" \ + "an up-to-date installation)" + #---- + move_from_maindir "$stampfile" + html_file_row "$stampfile" "Timestamp+version file" \ + "
(updated only after a successful build, useful for" \ + "automatic scripts)" + #---- + # don't copy this, since it's still being written to + _rm "$scriptlogfile"; ln "$maindir/$scriptlogfile" "$scriptlogfile" + html_file_row "$scriptlogfile" "Full build log" + #---- + html_table_end + html_content_end + html_end + + ## -------------------------------------------------------------------------- + separator "Making and installing web content" + + _rmcd "$maindir/$webdir" + # distribute only if this is a normal build + if [[ "$w" = "$prewebdir" ]]; then + _run "$webscript" --dist + else + _run "$webscript" + fi + + ## -------------------------------------------------------------------------- + separator "Patching up pre-release web content" + + _cd "$maindir/$w" + _run "$htmlpatchscript" "$maindir/$webdir/pre" + + ## -------------------------------------------------------------------------- + if [[ "$w" = "$prewebdir" ]]; then + separator "Creating a site-map" + _cd "$maindir/$w" + _run "$sitemapdir/sitemap_gen.py" --config="$sitemapdir/plt-pre.xml" \ + > /dev/null + fi + +} + + +############################################################################### +### Main dispatch + +if [[ "$1" = "--dispatch" ]]; then + shift + while [[ "$1" = *"="* ]]; do eval "export $1"; shift; done + machine="$1"; go="$2"; shift 2 + init_svnpath_vars # set the svnpath variables according to the env vars + machineget platform # set the global platform for dependable script pieces + show "Working on $machine($hostname)" + show "Dispatching to $go($*)" + "$go" "$@" + show "Done working on $machine($hostname)" +elif [[ "$scriptlog" = "yes" ]]; then + show "Working on $machine($hostname)" + rm -f "$maindir/$scriptlogfile" + { echo "This is the build log, generated by $buildscript"; echo "" + echo "Search for \"BOOM\" for any errors."; echo "" + # set | grep "^[a-z].*=" | awk '{ print " " $0 }'; echo "" + } > "$maindir/$scriptlogfile" + if [[ "$scriptlog" = "only" ]]; then + exec >> "$maindir/$scriptlogfile" 2>&1 + MAIN "$@" + else + MAIN "$@" 2>&1 | tee -a "$maindir/$scriptlogfile" + fi +else + show "Working on $machine($hostname)" + MAIN "$@" +fi + +exit + +############################################################################### diff --git a/collects/meta/build/bundle b/collects/meta/build/bundle new file mode 100755 index 0000000000..7cfa88e925 --- /dev/null +++ b/collects/meta/build/bundle @@ -0,0 +1,562 @@ +#!/bin/env mzscheme +;; -*- scheme -*- + +#lang scheme/base + +(require scheme/cmdline scheme/runtime-path scheme/match scheme/promise + meta/checker (prefix-in dist: meta/dist-specs) meta/specs + (for-syntax scheme/base) ; for runtime-path + (except-in scheme/mpair mappend) + (only-in (lib "process.ss") system)) + +(define (/-ify x) + (regexp-replace #rx"/?$" (if (path? x) (path->string x) x) "/")) +(define home/ (/-ify (expand-user-path "~scheme"))) +(define binaries/ (/-ify (build-path home/ "binaries"))) +(define target/ (/-ify (build-path home/ "pre-installers"))) +(define plt/ (/-ify (or (getenv "PLTHOME") + (error 'bundle "PLTHOME is not defined")))) +(define plt-base/ (/-ify (simplify-path (build-path plt/ 'up) #f))) +(define plt/-name (let-values ([(base name dir?) 
(split-path plt/)]) + (path-element->string name))) + +(define cd current-directory) + +(define *readme-file* + (build-path plt/ "readme.txt")) +(define *info-domain-file* + (build-path plt/ "collects" "info-domain" "compiled" "cache.rktd")) + +(define *info-domain-cache* #f) + +(define-runtime-path *spec-file* "distribution-specs") +(define-runtime-path *readme-specs-file* "readme-specs") + +(define *verify?* #t) +(define *btgz?* #t) +(define *pack?* #t) +(define *root?* #t) +(define *release?* #f) +(define *verbose?* 'yes) ; #t, #f, or else -- show stderr stuff but not stdout + +;;; =========================================================================== +;;; Utilities etc + +(define concat string-append) + +(define (sort* l) + (sort l stringstring (apply directory-list args)))) + +(define (dprintf fmt . args) + (when *verbose?* + (apply fprintf (current-error-port) fmt args) + (flush-output (current-error-port)))) + +;;; =========================================================================== +;;; Tree utilities + +;; path -> tree +;; Same as get-tree, but lists the contents of a tgz file via pax. +(define (get-tgz-tree tgz) + (define base (regexp-replace #rx"/$" (path->string (cd)) "")) + (define tgz-name + (regexp-replace #rx"^.*/" (if (path? tgz) (path->string tgz) tgz) "")) + (define (tree+rest paths curdir) + (define cur-rx (regexp (concat "^" (regexp-quote curdir)))) + (define m + (let ([m (and (pair? paths) + (regexp-match-positions cur-rx (car paths)))]) + (and m (regexp-match-positions #rx"/.*/" (car paths) (cdar m))))) + (if m + ;; we have too many "/"s => need to reconstruct a fake intermediate dir + (tree+rest (cons (substring (car paths) 0 (add1 (caar m))) paths) curdir) + (let loop ([paths paths] [contents '()]) + (when (pair? paths) + (prop-set! (car paths) 'tgz tgz-name) + (prop-set! (car paths) 'base base) + (prop-set! + (car paths) 'name + (cond [(regexp-match #rx"^(?:.*/)?([^/]+)/?$" (car paths)) => cadr] + [else (error 'get-tgz-tree + "bad path name: ~s" (car paths))]))) + (if (and (pair? paths) (regexp-match? cur-rx (car paths))) + ;; still in the same subtree + (if (regexp-match? #rx"/$" (car paths)) + ;; new directory + (let-values ([(tree rest) (tree+rest (cdr paths) (car paths))]) + (loop rest (cons tree contents))) + ;; new file + (loop (cdr paths) (cons (car paths) contents))) + ;; in a new subtree + (values (cons curdir (reverse contents)) paths))))) + (define-values (p pout pin perr) + (subprocess #f /dev/null-in (current-error-port) /tar "tzf" tgz)) + (parameterize ([current-input-port pout]) + (let loop ([lines '()]) + (let ([line (read-line)]) + (if (eof-object? line) + (let ([paths (sort* (reverse lines))]) + (subprocess-wait p) + (unless (eq? 0 (subprocess-status p)) + (error 'get-tgz-tree "`tar' failed.")) + (let-values ([(tree rest) (tree+rest paths "")]) + (if (null? rest) + (cdr tree) + (error 'get-tgz-tree "something bad happened (~s...)" + (car paths))))) + (loop (cons line lines))))))) + +;;; =========================================================================== +;;; Spec management + +(define *readme-specs* (make-parameter #f)) + +;;; =========================================================================== +;;; Start working + +(register-macros!) 
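+
+;; For illustration: given a tgz whose `tar tzf' listing is
+;;   plt/
+;;   plt/collects/
+;;   plt/collects/foo.ss
+;; `get-tgz-tree' above should return roughly
+;;   (("plt/" ("plt/collects/" "plt/collects/foo.ss")))
+;; -- each directory becomes a list headed by its path, files remain path
+;; strings, and every path carries the 'tgz, 'base, and 'name properties.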
+ +(define *platforms* #f) +(define *bin-types* #f) +(define *src-types* #f) +(define *platform-tree-lists* #f) +(define /pax #f) +(define /tar #f) +(define /dev/null-out #f) +(define /dev/null-in #f) + +(define (process-command-line) + (command-line + #:multi + ["+d" "Verify dependencies (default)" (set! *verify?* #t)] + ["-d" "Don't verify dependencies" (set! *verify?* #f)] + ["+v" "Verbose mode (on stdout)" (set! *verbose?* #t)] + ["-v" "Normal output (only stderr) (default)" (set! *verbose?* 'yes)] + ["-q" "Quiet mode" (set! *verbose?* #f)] + ["+b" "Create binary tgzs (default)" (set! *btgz?* #t)] + ["-b" "Skip binary tgzs, re-use binary trees" (set! *btgz?* #f)] + ["+p" "Pack distributions (default)" (set! *pack?* #t)] + ["-p" "Skip packing" (set! *pack?* #f)] + ["+r" "chown the contents to root (default)" (set! *root?* #t)] + ["-r" "Do not chown the contents to root" (set! *root?* #f)] + ["++release" "Build for a release" (set! *release?* #t)] + ["-o" dest "Destination directory" (set! target/ (/-ify dest))] + ["--text" "Stands for -d +v -b -p -r (useful for debugging)" + (set!-values (*verify?* *verbose?* *btgz?* *pack?* *root?*) + (values #f #t #f #f #f))]) + (current-verbose-port (and *verbose?* current-error-port))) + +;; specs can have `lambda' expressions to evaluate, do it in this context +(define-namespace-anchor bundle-specs) + +(define (read-spec-file file [param *specs*]) + (process-specs + (with-input-from-file file + (lambda () + (let loop ([xs '()]) + (let ([x (read)]) + (if (eof-object? x) (reverse xs) (loop (cons x xs))))))) + param)) + +(define (read-specs) + (current-namespace (namespace-anchor->namespace bundle-specs)) + (dprintf "Reading specs...") + (dist:register-specs!) + (read-spec-file *readme-specs-file* *readme-specs*) + (dprintf " done.\n")) + +(define (input-tgz-name? f) + (let ([f (if (path? f) (path->string f) f)]) + ;; names of tgzs that are not the generated binary ones + (and (regexp-match? #rx"\\.tgz$" f) + (not (regexp-match? #rx"-binaries\\.tgz$" f))))) + +(define (initialize) + (when *release?* (*environment* (cons 'release (*environment*)))) + (set! /pax (or (find-executable-path "pax" #f) + (error "error: couldn't find a `pax' executable"))) + (set! /tar (or (find-executable-path "gtar" #f) + (error "error: couldn't find a `gtar' executable"))) + (set! /dev/null-out (open-output-file "/dev/null" #:exists 'append)) + (set! /dev/null-in (open-input-file "/dev/null")) + (unless (directory-exists? target/) (make-directory target/)) + (let ([d (ormap (lambda (x) (and (not (directory-exists? x)) x)) + (list home/ plt/ binaries/ target/))]) + (when d (error 'bundle "directory not found: ~a" d))) + (set! *platforms* + (parameterize ([cd binaries/]) + (filter (lambda (x) + (and (not (regexp-match? #rx"^[.]" x)) + (directory-exists? x))) + (dir-list)))) + (set! *bin-types* (map string->symbol *platforms*)) + (set! *src-types* + (let loop ([bins *bin-types*] [r '()]) + (if (null? bins) + (reverse r) + (let* ([bin (car bins)] [src (get-tag bin)]) + (cond + [(not src) (error 'binaries "no type assigned to `~e'" bin)] + [(not (= 1 (length src))) + (error 'binaries "bad type assignment for `~e': ~e" bin src)] + [else (loop (cdr bins) + (if (memq (car src) r) r (cons (car src) r)))]))))) + (dprintf "Scanning full tgzs") + (set! 
*platform-tree-lists* + (parameterize ([cd binaries/]) + (map (lambda (platform) + (dprintf ".") + (parameterize ([cd platform]) + ;; if no btgz *and* "plt" already created then use get-tree + ;; (useful when debugging stuff so re-use pre made ones) + ;; should work the same with an old tree + (if (and (directory-exists? "plt") (not *btgz?*)) + (filtered-map + (lambda (x) ; only directories contain stuff we need + (and (directory-exists? x) (get-tree x))) + (dir-list)) + (let ([trees (filtered-map + (lambda (x) + (and (file-exists? x) (input-tgz-name? x) + (get-tgz-tree x))) + (dir-list))]) + (tag (list (string->symbol platform)) + (map (lambda (tree) (tree-filter 'binaries tree)) + (apply append trees))))))) + *platforms*))) + (dprintf " done.\n") + (for-each (lambda (platform trees) + (when (null? trees) + (error 'binaries "no binaries found for ~s" platform))) + *platforms* *platform-tree-lists*) + ;; Create the readme file so it is included with the plt tree + (with-output-to-file *readme-file* newline #:exists 'truncate) + ;; Get the plt tree, remove junk and binary stuff + (set-plt-tree! plt-base/ plt/-name *platform-tree-lists*) + (set-bin-files-delayed-lists! + (delay (map (lambda (trees) + (sort* (mappend tree-flatten (add-trees trees)))) + *platform-tree-lists*))) + ;; Get the plt tree, remove junk and binary stuff + (delete-file *readme-file*)) + +;; works with any newline format, expects text that always ends with a newline, +;; does not handle tabs, does not handle prefix whitespaces, is not efficient. +(define (wrap-string str width) + (define (wrap-line str nl r) + (cond [(<= (string-length str) width) (list* nl str r)] + [(or (regexp-match-positions #rx"^.*( +)" str 0 width) + ;; no space in limit, go for the first space afterwards + (regexp-match-positions #rx"^.*?( +)" str)) + => (lambda (m) + (wrap-line (substring str (cdadr m)) nl + (list* nl (substring str 0 (caadr m)) r)))] + [else (list* nl str r)])) + (let loop ([str str] [r '()]) + (let ([m (regexp-match #rx"^(.*?)(\r\n|\r|\n)(.*)$" str)]) + (if m + (loop (cadddr m) (wrap-line (cadr m) (caddr m) r)) + (apply string-append (reverse (cons str r))))))) + +(define (make-readme) + (let ([readme (parameterize ([*specs* (*readme-specs*)]) + (apply string-append (expand-spec 'readme)))]) + (display (wrap-string readme 72)))) + +(define (make-info-domain trees) + (unless (= 1 (length trees)) + (error 'make-info-domain "got zero or multiple trees: ~e" trees)) + (let* ([collects (or (tree-filter "/plt/collects/" (car trees)) + (error 'make-info-domain "got no collects in tree"))] + [info (filter (lambda (x) + (let ([x (path->string (bytes->path (car x)))]) + (pair? (tree-filter (concat "/plt/collects/" x) + collects)))) + *info-domain-cache*)]) + (lambda () (write info) (newline)))) + +(define (create-binaries platform trees) + (parameterize ([cd (build-path binaries/ platform)]) + (let ([full-tgz (concat "plt-"platform"-full.tgz")] + [bin-tgz (concat "plt-"platform"-binaries.tgz")] + [all-tgzs (filter input-tgz-name? + (map path->string (directory-list)))]) + (unless (and (directory-exists? "plt") (not *btgz?*)) + (dprintf "Unpacking binaries in ~s ~a\n" platform all-tgzs) + ;; even if a "plt" directory exists, we just overwrite the same stuff + (unless (member full-tgz all-tgzs) + (error 'create-binaries "~a/~a not found" (cd) full-tgz)) + (for ([tgz all-tgzs]) (unpack tgz trees))) + (when *btgz?* + (dprintf "Creating ~s\n" bin-tgz) + (when (file-exists? 
bin-tgz) (delete-file bin-tgz)) + (let-values ([(p pout pin perr) + (subprocess + (current-output-port) /dev/null-in (current-error-port) + ;; see below for flag explanations + /pax "-w" "-x" "ustar" "-z" "-f" bin-tgz + ;; only pack the plt dir (only exception is Libraries on + ;; OSX, but that has its own dir) + "plt")]) + (subprocess-wait p)))))) + +(define (pack archive trees prefix) + ;; `pax' is used to create the tgz archives -- the main reasons for using it + ;; is the fact that it can generate portable "ustar" tar files, and that it + ;; is flexible enough to allow replacing file names, so we can collect files + ;; from different directories and make them all appear in a single one in the + ;; resulting archive. + (when (eq? #t *verbose?*) (printf "~a:\n" archive)) + (cond [*pack?* + (dprintf " packing...") + (when (file-exists? archive) (delete-file archive)) + (let*-values ([(output) (if (eq? #t *verbose?*) + (current-output-port) /dev/null-out)] + [(p pout pin perr) + ;; Note: pax prints converted paths on stderr, so + ;; silence it too unless verbose. Use only for + ;; debugging. + (subprocess + output #f output + /pax + "-w" ; write + "-x" "ustar" ; create a POSIX ustar format + "-z" ; gzip the archive + "-d" ; dont go down directories implicitly + "-s" (format ",^~a,,p" prefix) ; delete base paths + "-f" archive ; pack to this file + )]) + (parameterize ([current-output-port pin]) + (for ([t trees]) (print-tree t 'full))) + (close-output-port pin) + (subprocess-wait p) + (unless (eq? 0 (subprocess-status p)) + (error 'pack "`pax' failed.")))] + [(eq? #t *verbose?*) (for ([t trees]) (print-tree t))]) + (when (eq? #t *verbose?*) (newline)) + (flush-output)) + +(define (unpack archive trees) + ;; unpack using tar (doesn't look like there's a way to unpack according to + ;; files from stdin with pax, and it uses gnu format with @LongLinks). + (let-values + ([(p pout pin perr) + (subprocess + (current-output-port) #f (current-error-port) /tar + "x" ; extract + "-z" ; gunzip the archive + "-p" ; preserve permissions + "--files-from=-" ; read files from stdin + "-f" archive ; unpack this file + )] + [(trees) + (map (lambda (t) + (tree-filter + (lambda (t) + ;; Problem: if this returns #t/#f only, then the sources can + ;; come from multiple tgz since each file will be identified + ;; by itself. But if this is done, then no empty directories + ;; will be included (see `tree-filter' comment) and this will + ;; later be a problem (to have an empty dir in the tree but + ;; not on disk) -- so return '+ and as soon as a root is + ;; identified with the tgz, all of it will be used. + (and + (equal? archive + (prop-get (tree-path t) 'tgz + (lambda () + (error 'unpack + "no `tgz' property for ~e" t)))) + '+)) + t)) + trees)]) + (parameterize ([current-output-port pin]) + (for ([t trees]) (print-tree t 'only-files))) + (close-output-port pin) + (subprocess-wait p) + (unless (eq? 0 (subprocess-status p)) (error 'unpack "`tar' failed.")))) + +;; This code implements the binary filtering of 3m/cgc files, see +;; `binary-keep/throw-templates' in "distribution-specs.ss". +;; Careful when editing! +(define (filter-bintree tree) + (define (get-pattern spec) + (let ([rx (expand-spec spec)]) + (if (and (pair? rx) (null? (cdr rx)) (string? 
(car rx))) + (car rx) + (error 'filter-bintree "bad value for ~e: ~e" spec rx)))) + (define keep-pattern (get-pattern 'binary-keep)) + (define throw-pattern (get-pattern 'binary-throw)) + (define keep-rx (regexpify-spec (string-append "*" keep-pattern "*"))) + (define throw-rx (regexpify-spec (string-append "*" throw-pattern "*"))) + (define templates + (let ([ts (expand-spec 'binary-keep/throw-templates)]) + (for ([t ts]) + (unless (and (string? t) + ;; verify that it has exactly one "<...!...>" pattern + (regexp-match? #rx"^[^]*<[^]*![^]*>[^]*$" t)) + (error 'filter-bintree "bad keep/throw template: ~e" t))) + ts)) + (define (make-matcher x) ; matchers return match-positions or #f + (let ([rxs (map (lambda (t) + (let* ([x (regexp-replace #rx"!" t x)] + [x (object-name (regexpify-spec x #t))] + [x (regexp-replace #rx"<(.*)>" x "(\\1)")]) + (regexp x))) + templates)]) + (lambda (p) (ormap (lambda (rx) (regexp-match-positions rx p)) rxs)))) + (define (rassoc x l) + (and (pair? l) (if (equal? x (cdar l)) (car l) (rassoc x (cdr l))))) + (define keep? (make-matcher keep-pattern)) + (define throw? (make-matcher throw-pattern)) + (define existing-paths (tree-flatten tree)) + ;; The two `*-paths' values are association lists: (( . ) ...) + ;; both sides are unique in each list, the lhs is always an existing path + (define (find-paths pred? mode rx) + (define res '()) + (let loop ([t tree]) + (let ([p (tree-path t)]) + (cond [(pred? p) + => (lambda (m) + (let ([plain (string-append (substring p 0 (caadr m)) + (substring p (cdadr m)))]) + (when (rassoc plain res) + (error 'filter-bintree + "two ~s templates have the same plain: ~e -> ~e" + mode p plain)) + (set! res `((,p . ,plain) ,@res))) + #t)] + [(regexp-match? rx p) + ;; other matches are not allowed, unless on a directory where + ;; all files are selected + (when (or (not (pair? t)) + (memq #f (map loop (cdr t)))) + (error 'filter-bintree + "~s path uncovered by patterns: ~e" mode p)) + #t] + [(pair? t) (not (memq #f (map loop (cdr t))))] + [else #f]))) + res) + (define keep-paths (find-paths keep? 'keep keep-rx)) + (define throw-paths (find-paths throw? 'throw throw-rx)) + (for ([k keep-paths]) + (when (assoc (car k) throw-paths) + (error 'filter-bintree + "a path matched both keep and throw patterns: ~s" (car k)))) + (let* ([ps (map cdr keep-paths)] + [ps (append ps (remove* ps (map cdr throw-paths)))] + [scan (lambda (f paths) + (map (lambda (p) (cond [(f p paths) => car] [else #f])) ps))] + [plain (scan member existing-paths)] + [keep (scan rassoc keep-paths)] + [throw (scan rassoc throw-paths)]) + (define del + (map (lambda (p k t) + (cond + [(and p k t) (error 'filter-bintree "got keep+throw+plain")] + [(or k t) (or t p)] + [else (error 'filter-bintree "internal error")])) + plain keep throw)) + (tree-filter `(not (or ,(lambda (t) (and (memq (tree-path t) del) '+)) + binary-throw-more)) + tree))) + +;; This is hooked below as a `distribute!' spec macro, and invoked through +;; expand-spec. +(define (distribute!) + (define (distribute tree) (tree-filter 'distribution tree)) + (let* ([features (filter string? (reverse (*environment*)))] + [name (apply concat (cdr (mappend (lambda (x) (list "-" x)) + features)))] + [features (map string->symbol features)] + [bin? (memq 'bin features)] + [src? (memq 'src features)] + [full? (memq 'full features)]) + (when (and bin? src?) + (error 'distribute! "bad configuration (both bin & src): ~e" features)) + (unless (or bin? src?) + (error 'distribute! 
"bad configuration (both bin & src): ~e" features)) + (for ([type (if bin? *bin-types* *src-types*)] + ;; this is unused if bin? is false + [bin-trees (if bin? *platform-tree-lists* *src-types*)]) + (tag (cons type features) + (let ([name (format "~a-~a.tgz" name type)]) + (dprintf "Creating ~s: filtering..." name) + (let ([trees (add-trees + (cons (distribute (get-plt-tree)) + (if bin? + (tag 'in-binary-tree + (map (if full? + distribute + (lambda (t) + (distribute (filter-bintree t)))) + bin-trees)) + '())))]) + ;; make it possible to write these files + (chown 'me *readme-file* *info-domain-file*) + (with-output-to-file *readme-file* #:exists 'truncate make-readme) + (with-output-to-file *info-domain-file* #:exists 'truncate + (make-info-domain trees)) + (chown 'root *readme-file* *info-domain-file*) + (pack (concat target/ name) trees + (if bin? + (format "\\(~a\\|~a~a/\\)" plt-base/ binaries/ type) + plt-base/))) + (dprintf " done.\n"))))) + '()) +(register-spec! 'distribute! + (lambda () (when (or *pack?* (eq? #t *verbose?*)) (distribute!)))) + +(register-spec! 'verify! (lambda () (when *verify?* (verify!)))) + +;; make auto-generated files exist +(define (create-generated-files) + ;; no need to create the cache.ss, since it's there, but read it + (set! *info-domain-cache* + (with-input-from-file *info-domain-file* read)) + (with-output-to-file *readme-file* newline #:exists 'truncate)) +(define (delete-generated-files) + ;; don't delete the cache, but write original unfiltered contents + (with-output-to-file *info-domain-file* + (lambda () (write *info-domain-cache*) (newline)) #:exists 'truncate) + (delete-file *readme-file*)) + +;; mimic the chown syntax +(define (chown #:rec [rec #f] who path . paths) + (when (and *root?* *pack?*) + (let ([user:group + (case who [(root) "root:root"] [(me) (force whoami)] + [else (error 'chown "unknown user spec: ~e" who)])] + [paths (map (lambda (x) (if (path? x) (path->string x) x)) + (cons path paths))]) + (when (ormap (lambda (x) (regexp-match? #rx"[^/a-zA-Z0-9_ .+-]" x)) paths) + (error 'chown "got a path that needs shell-quoting: ~a" paths)) + (system (format "sudo chown ~a ~a ~a" (if rec "-R" "") user:group + (apply string-append + (map (lambda (p) (format " \"~a\"" p)) paths))))))) + +(define whoami + (delay + (parameterize ([current-output-port (open-output-string)]) + (system "echo \"`id -nu`:`id -ng`\"") + (regexp-replace + #rx"[ \r\n]*$" (get-output-string (current-output-port)) "")))) + +(define (chown-dirs-to who) + (when (and *root?* *pack?*) + (dprintf "Changing owner to ~a..." 
who) + (for ([dir (list plt/ binaries/)]) + (parameterize ([cd dir]) (chown #:rec #t who "."))) + (dprintf " done.\n"))) + +(process-command-line) +(read-specs) +(initialize) +(for-each create-binaries *platforms* *platform-tree-lists*) +(dynamic-wind + (lambda () (create-generated-files) (chown-dirs-to 'root)) + ;; Start the verification and distribution + (lambda () (expand-spec 'distributions) (void)) + (lambda () (chown-dirs-to 'me) (delete-generated-files))) diff --git a/collects/meta/build/info.rkt b/collects/meta/build/info.rkt new file mode 100644 index 0000000000..dd5e03927c --- /dev/null +++ b/collects/meta/build/info.rkt @@ -0,0 +1,2 @@ +#lang setup/infotab +(define compile-omit-paths 'all) diff --git a/collects/meta/build/make-patch b/collects/meta/build/make-patch new file mode 100755 index 0000000000..ea8bba724d --- /dev/null +++ b/collects/meta/build/make-patch @@ -0,0 +1,200 @@ +#!/bin/sh +#| -*- scheme -*- +exec racket "$0" + +Instructions: + +* Create a copy of a distributed PLT tree, change all files that need to change + for the patch. If this is not a first patch, then begin this process with a + tree that has the previous patch applied. (Patch numbers should go from 1 + up.) + + I do this: + cd + svn co http://svn.plt-scheme.org/plt/tags/ patched + cd patched + svn merge -r: http://svn.plt-scheme.org/plt/trunk + ... more merges as needed ... + +* Make sure that "collects/version/patchlevel.ss" contains the new patch + number, and add comments about this patch, with a list of files that are + modified. (This is good for the next step, when doing additional patches.) + +* In the code below, + - set `plt-version' to the version you're patching (base version, the code + will expect `(version)' to return an equal value). + - set `plt-base' to the location of the patched PLT tree on your system. + - put the list of files in the `files' definition. Each patch should also + have all preceding patches in it, which means that if you're patching an + already-patched tree, then you should add more files. (This is why it is + good to keep track of the modified files.) Note that + "collects/version/patchlevel.ss" must be included in this list, and that + the file does have the correct patchlevel number (there is currently no way + to check whether the patchlevel makes sense). + +* Note that the patch is a collection with the same name ("plt-patch" below). + This means that installing a patch is a process that first overwrites any + preexisting patch collections. This is fine, because patches are linear and + cumulative. The worst that can happen is that someone downloads a patch + older than what's installed -- in that case the PLT tree already has the + higher patch level, and when the collection's installer is doing its work it + will simply be skipped (a successful patch installation happens only once, + and is later skipped when setup-plt is re-run). + +* Test, put in "iplt/web/download/patches/", publish new html, announce. + +* Commit the patched tree as a new tag. + +|# + +#lang mzscheme + +;; ============================================================================ +;; customization (items marked with `[*]' should be edited for all patches) + +;; [*] which PLT version is this patch for? 
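+;; (the generated unpacker below compares this, via `equal?', against
+;;  `(version)' of the tree being patched, so it must match the base release
+;;  string exactly -- e.g. "370" patches only a v370 installation)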
+(define plt-version "370") + +;; [*] location of a patched PLT tree +(define plt-base "~/patched") + +;; [*] patched files in this tree (including previously patched files, if any) +(define files '("collects/version/patchlevel.ss" + "collects/drscheme/private/module-language.ss" + "collects/framework/private/scheme.ss" + "collects/slideshow/tool.ss" + "collects/lang/htdp-langs.ss" + "collects/drscheme/private/unit.ss")) + +;; message to show after the last `Done' (#f => no extra text) +(define exit-message "please restart DrScheme") + +;; template for the output archive file +(define patchfile-template "/tmp/plt-patch-v~ap~a.plt") + +;; template for archive name +(define name-template "PLT Scheme v~ap~a patch") + +;; patchlevel file in the PLT tree (must be included in `files' above) +(define patchlevel-file "collects/version/patchlevel.ss") + +;; ============================================================================ +;; code folows + +(require (lib "list.ss") (lib "pack.ss" "setup")) + +;; move patchlevel file to the end +(unless (member patchlevel-file files) + (error 'make-patch + "missing patchlevel file (~a) in the list of files" patchlevel-file)) +(set! files (append (remove patchlevel-file files) (list patchlevel-file))) + +(unless (absolute-path? plt-base) + (error 'make-patch "plt-base is not an absolute path: ~a" plt-base)) + +(define patchlevel + ;; use `dynamic-require' -- not `require' since the patch can be built by a + ;; different PLT installation + (dynamic-require (build-path plt-base patchlevel-file) 'patchlevel)) +(define archive-name (format name-template plt-version patchlevel)) +(define archive-filename (format patchfile-template plt-version patchlevel)) + +(define unpacker-body + `((define me ,(format "v~ap~a-patch" plt-version patchlevel)) + (define (error* fmt . args) + (error (string-append "ERROR applying "me": " (apply format fmt args)))) + (define (message fmt . args) + (printf "*** ~a: ~a\n" me (apply format fmt args))) + (define collects-dir (find-collects-dir)) + (cond + [(not (equal? ,plt-version (version))) + (error* "bad version number; this patch is for version ~a, you have ~a" + ',plt-version (version))] + [(= patchlevel ,patchlevel) (error* "Already installed")] + [(> patchlevel ,patchlevel) (error* "Newer patch installed")] + [else (message "Applying patch...")]) + (mzuntar void) + (message "Patch applied successfully, recompiling...") + ;; return a list of all toplevel collections to recompile + ;; (define (has-info? c) + ;; (file-exists? (build-path collects-dir c "info.ss"))) + ;; (let* ([cs (directory-list collects-dir)] + ;; [cs (filter has-info? cs)] + ;; [cs (map path->string cs)] + ;; [cs (sort cs string\n") +(define end-pattern #"\n") + +(define begin-re (regexp-replace #"XXX" begin-pattern #"([^<> ]+)")) +(define end-re (regexp-replace #"XXX" end-pattern #"([^<> ]+)")) + +(define (regexp-match1 rx inp . disp?) + (cond [(if (and (pair? disp?) (car disp?)) + (regexp-match rx inp 0 #f (current-output-port)) + (regexp-match rx inp)) + => cadr] + [else #f])) + +(define (eprintf fmt . args) + (apply fprintf (current-error-port) fmt args)) + +(define (patch-file skeleton html) + (let ([skeleton (open-input-file skeleton)] + [html (open-input-file html)]) + (let loop () + (let ([begin-tag (regexp-match1 begin-re skeleton #t)]) + ;; (eprintf ">>> skeleton: ~a begin\n" begin-tag) + (if begin-tag + (let ([begin-tag* (regexp-match1 begin-re html)]) + ;; (eprintf ">>> html: ~a begin\n" begin-tag*) + (unless (equal? 
begin-tag begin-tag*) + (error 'patch-html + "mismatched input begin-tags, expecting ~a got ~a" + begin-tag begin-tag*)) + ;; leave tags in, so it is possible to run this script again + (display (regexp-replace #"XXX" begin-pattern begin-tag)) + (let ([end-tag (regexp-match1 end-re html #t)]) + ;; (eprintf ">>> html: ~a end\n" end-tag) + (unless (equal? end-tag begin-tag) + (error 'patch-html "bad end tag (~a) for begin tag (~a)" + end-tag begin-tag)) + (let ([end-tag* (regexp-match1 end-re skeleton)]) + ;; (eprintf ">>> skeleton: ~a end\n" end-tag*) + (unless (equal? end-tag end-tag*) + (error 'patch-html + "mismatched input end-tags, expecting ~a got ~a" + end-tag end-tag*)) + ;; leave tags in, so it is possible to run this script again + (display (regexp-replace #"XXX" end-pattern end-tag)) + (loop)))) + (cond [(regexp-match1 begin-re html) => + (lambda (tag) + (error 'patch-html + "mismatched input tags, extraneous tag in target: ~a" + tag))])))) + (close-input-port skeleton) + (close-input-port html))) + +(define (patch-dir skeleton-dir) + (printf "patching directory: ~a\n" (current-directory)) + (for-each (lambda (p) + (if (cdr p) + (begin + (unless (directory-exists? (car p)) (make-directory (car p))) + (parameterize ([current-directory (car p)]) + (patch-dir (build-path skeleton-dir (car p))))) + (let ([skeleton (build-path skeleton-dir (car p))]) + (if (file-exists? (car p)) + (let ([tmp "/tmp/patch-html-file"]) + (printf "patching file: ~a\n" + (build-path (current-directory) (car p))) + (with-output-to-file tmp + (lambda () (patch-file skeleton (car p))) + #:exists 'truncate) + (delete-file (car p)) + (copy-file tmp (car p)) + (delete-file tmp)) + (begin (printf "copying file: ~a/~a\n" + (current-directory) (car p)) + (copy-file skeleton (car p))))))) + (parameterize ([current-directory skeleton-dir]) + (map (lambda (p) + (cons p (cond [(file-exists? p) #f] + [(directory-exists? p) #t] + [else (error "internal-error")]))) + (directory-list))))) + +(define (main arg) + (patch-dir (path->complete-path arg))) diff --git a/collects/meta/build/readme-specs b/collects/meta/build/readme-specs new file mode 100644 index 0000000000..bc73068d32 --- /dev/null +++ b/collects/meta/build/readme-specs @@ -0,0 +1,137 @@ +;; -*- scheme -*- + +;; This file defines the readme files for the different distributions. It is +;; similar to the distribution specs file, see that for explanations on its +;; format. + +\\ := (cond win => "\r\n" + ;; (or ppc-osx-mac i386-osx-mac) => "\r" ; is this still needed? 
+ else => "\n" ) + +package-name +:= (cond full => "PLT Scheme Full Repository" + plt => "PLT Scheme" + dr => "DrScheme" + mr => "MrEd" + mz => "MzScheme") + +dist-type +:= (cond src => "source" + else => "executable") + +platform-type +:= (cond unix => "Unix" + mac => "Macintosh" + win => "Windows") +platform +:= (cond i386-linux => "Linux (i386)" + i386-linux-gcc2 => "Linux (i386/gcc2)" + i386-linux-fc2 => "Fedora Core 2 (i386)" + i386-linux-fc5 => "Fedora Core 5 (i386)" + i386-linux-fc6 => "Fedora Core 6 (i386)" + i386-linux-f7 => "Fedora 7 (i386)" + x86_64-linux-f7 => "Fedora 7 (x86_64)" + i386-linux-f9 => "Fedora 9 (i386)" + i386-linux-f12 => "Fedora 12 (i386)" + i386-linux-debian => "Debian Stable (i386)" + i386-linux-debian-testing => "Debian Testing (i386)" + i386-linux-debian-unstable => "Debian Unstable (i386)" + i386-linux-ubuntu => "Ubuntu (i386)" + i386-linux-ubuntu-dapper => "Ubuntu Dapper (i386)" + i386-linux-ubuntu-edgy => "Ubuntu Edgy (i386)" + i386-linux-ubuntu-feisty => "Ubuntu Feisty (i386)" + i386-linux-ubuntu-hardy => "Ubuntu Hardy (i386)" + i386-linux-ubuntu-intrepid => "Ubuntu Intrepid (i386)" + i386-linux-ubuntu-jaunty => "Ubuntu Jaunty (i386)" + i386-freebsd => "FreeBSD (i386)" + sparc-solaris => "Solaris" + ppc-osx-mac => "Mac OS X (PPC)" + i386-osx-mac => "Mac OS X (Intel)" + ppc-darwin => "Mac OS X using X11 (PPC)" + i386-darwin => "Mac OS X using X11 (Intel)" + i386-win32 => "Windows" + else => platform-type) + +executable := (cond mac => "application" else => "executable") +dir := (cond (or win mac) => "folder" else => "directory") + +version := (lambda () (version)) + +drscheme* +:= (cond unix => "bin/drscheme" win => "DrScheme.exe" mac => "DrScheme") +plt-help* +:= (cond unix => "bin/plt-help" win => "plt-help.exe" mac => "bin/plt-help") +setup-plt* +:= (cond unix => "bin/setup-plt" win => "Setup PLT.exe" mac => "bin/setup-plt") +mred* +:= (cond unix => "bin/mred" win => "MrEd.exe" mac => "MrEd") +mzscheme* +:= (cond unix => "bin/mzscheme" win => "MzScheme.exe" mac => "bin/mzscheme") +mzc* +:= (cond unix => "bin/mzc" win => "mzc.exe" mac => "bin/mzc") +planet* +:= (cond unix => "bin/planet" win => "planet.exe" mac => "bin/planet") + +intro +:= "This is the "package-name" v"(version)" "dist-type" package "dir" for " + platform"." \\ + +main-exe +:= "These are some of the important "executable"s that are included:" \\ + \\ + (cond (or dr plt full) => + " "drscheme*" -- the PLT Scheme development environment" \\ \\) + " "mzscheme*" -- a text-only Scheme interpreter" \\ + (cond (or md dr plt full) => + " "mred*" -- a graphical Scheme interpreter" \\) + " "mzc*" -- command-line tool for creating executables, etc." \\ + (cond (or dr plt full) => + " "plt-help*" --- for Help (also built into DrScheme)" \\) + " "setup-plt*" --- command-line setup tool" \\ + " "planet*" --- a command-line helper for for managing third-party " + "libraries" \\ + \\ + (cond full => "This package contains the complete build tree, which " + "includes `cgc' binaries that use a conservative collector." \\ + \\) + +main-src +:= "You must compile MzScheme " (cond (or mr dr plt full) => "and MrEd ") + "before using the "package-name" software" + (cond (or dr plt full) => " (including DrScheme)")"." \\ + \\ + "For compilation instructions, see \"" + (cond win => "plt\\src\\worksp\\README" + else => "plt/src/README") + "\"." 
\\ +main +:= (cond src => main-src else => main-exe) + +license +:= "License" \\ + "-------" \\ \\ + "PLT Software" \\ + "Copyright (c) 1995-2003 PLT" \\ + "Copyright (c) 2004-2008 PLT Inc." \\ + \\ + "PLT software is distributed under the GNU Lesser General Public " + "License (LGPL). This means you can link PLT software (such as " + "MzScheme or MrEd) into proprietary applications, provided you follow " + "the specific rules stated in the LGPL. You can also modify PLT " + "software; if you distribute a modified version, you must distribute it " + "under the terms of the LGPL, which in particular means that you must " + "release the source code for the modified software. See " + "doc/release-notes/COPYING.LIB for more information." \\ + (cond full => + \\ "Note that this is the "package-name" distribution, which might " + "contain parts that are GPL." \\) + +more-information +:= "More Information" \\ + "----------------" \\ + \\ + "For further information, use DrScheme's `Help' menu, or run "plt-help*". " + "Also, visit http://www.plt-scheme.org/." \\ + +readme +:= intro \\ main \\ \\ license \\ \\ more-information diff --git a/collects/meta/build/sitemap/AUTHORS b/collects/meta/build/sitemap/AUTHORS new file mode 100644 index 0000000000..4858b377c7 --- /dev/null +++ b/collects/meta/build/sitemap/AUTHORS @@ -0,0 +1 @@ +opensource@google.com diff --git a/collects/meta/build/sitemap/COPYING b/collects/meta/build/sitemap/COPYING new file mode 100644 index 0000000000..e26c5fff1e --- /dev/null +++ b/collects/meta/build/sitemap/COPYING @@ -0,0 +1,37 @@ +Copyright (c) 2004, 2005, Google Inc. +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions +are met: + + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following + disclaimer in the documentation and/or other materials provided + with the distribution. + + * Neither the name of Google Inc. nor the names of its contributors + may be used to endorse or promote products derived from this + software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+ + +The sitemap_gen.py script is written in Python 2.2 and released to the open +source community for continuous improvements under the BSD 2.0 new license, +which can be found at: + + http://www.opensource.org/licenses/bsd-license.php diff --git a/collects/meta/build/sitemap/ChangeLog b/collects/meta/build/sitemap/ChangeLog new file mode 100644 index 0000000000..8fd659d2ef --- /dev/null +++ b/collects/meta/build/sitemap/ChangeLog @@ -0,0 +1,65 @@ +Wed Jun 01 01:00:00 2005 Google Inc. + + * sitemap_gen: initial release: + This directory contains Python utilities for creating + Sitemaps. + +Mon Jun 13 01:00:00 2005 Google Inc. + + * sitemap_gen.py: v1.1 + + [BIG] + Not blow up when dealing with international character encodings. + + [MODERATE] + Fix platform and Python version issues. In some versions of 2.2 + and certain platforms, True was not defined. Gak! + +Tue Jul 12 01:00:00 2005 Google Inc. + + * sitemap_gen.py: v1.2 + + [MODERATE] + Default_file option added to directory walking + Support for Extended Logfile Format (IIS's log format) + Allow wildcards in the "path" attribute on accesslog and urllist + input methods. + Running on Python 1.5 should exit cleanly with an error message + Stricter processing of configuration files + + [SMALL] + XML files written in "text" mode, so linefeeds are correct + One more Unicode issue fixed: Sitemap filenames with non-ascii + characters had still been problematic + In directory walking, the root URL of the walk now gets included + In directory walking, URLs to directories now have a "/" appended + URLs to files we recognize as our own script's Sitemap output files + are suppressed. + 'suppress_search_engine_notify="0"' now does what you would expect + Default priority on URLs is now 0.5 instead of 1.0 + Priority values written by default to only 4 decimal places + URLs to Sitemap files in the Sitemap index file are now encoded + according to the user's default_encoding, instead of forcing to UTF-8 + +Mon Aug 01 01:00:00 2005 Google Inc. + + * sitemap_gen.py: v1.3 + + [BIG] + input method added. + + [MODERATE] + Use proper IDNA encoding on international domain names. This is + only available on Python2.3 or higher. + + [SMALL] + Fixed Windows bug where directory walking would generate bad URLs on + 2+ deep subdirectories + +Wed Nov 03 01:00:00 2005 Google Inc. + + * sitemap_gen.py: v1.4 + + [SMALL] + Fixed bug where writing a gzipped sitemap would store the server's + file path in the archive. diff --git a/collects/meta/build/sitemap/PKG-INFO b/collects/meta/build/sitemap/PKG-INFO new file mode 100644 index 0000000000..dfa4c8a4f6 --- /dev/null +++ b/collects/meta/build/sitemap/PKG-INFO @@ -0,0 +1,10 @@ +Metadata-Version: 1.0 +Name: sitemap_gen +Version: 1.4 +Summary: Sitemap Generator +Home-page: http://sourceforge.net/projects/goog-sitemapgen/ +Author: Google Inc. +Author-email: opensource@google.com +License: BSD +Description: UNKNOWN +Platform: UNKNOWN diff --git a/collects/meta/build/sitemap/README b/collects/meta/build/sitemap/README new file mode 100644 index 0000000000..e8abdbb150 --- /dev/null +++ b/collects/meta/build/sitemap/README @@ -0,0 +1,25 @@ +sitemap_gen.py + +Version 1.4 + +The sitemap_gen.py script analyzes your web server and generates one or more +Sitemap files. These files are XML listings of content you make available on +your web server. The files can be directly submitted to search engines as +hints for the search engine web crawlers as they index your web site. 
This +can result in better coverage of your web content in search engine indices, +and less of your bandwidth spent doing it. + +The sitemap_gen.py script is written in Python and released to the open +source community for continuous improvements under the BSD 2.0 new license, +which can be found at: + + http://www.opensource.org/licenses/bsd-license.php + +The original release notes for the script, including a walk-through for +webmasters on how to use it, can be found at the following site: + + http://www.google.com/webmasters/sitemaps/sitemap-generator.html + +The minimum Python version required is Python 2.2. However, if URLs on +your site involve any non-ASCII characters, we strongly recommend +Python 2.3 or later, as it better handles encoding issues. diff --git a/collects/meta/build/sitemap/example_config.xml b/collects/meta/build/sitemap/example_config.xml new file mode 100644 index 0000000000..2e37eaa58b --- /dev/null +++ b/collects/meta/build/sitemap/example_config.xml @@ -0,0 +1,164 @@ + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/collects/meta/build/sitemap/example_urllist.txt b/collects/meta/build/sitemap/example_urllist.txt new file mode 100644 index 0000000000..f8192f68d9 --- /dev/null +++ b/collects/meta/build/sitemap/example_urllist.txt @@ -0,0 +1,21 @@ +# To add a list of URLs, make a space-delimited text file. The first +# column contains the URL; then you can specify various optional +# attributes in the form key=value: +# +# lastmod = modification time in ISO8601 (YYYY-MM-DDThh:mm:ss+00:00) +# changefreq = 'always' | 'hourly' | 'daily' | 'weekly' | 'monthly' | +# 'yearly' | 'never' +# priority = priority of the page relative to other pages on the same site; +# a number between 0.0 and 1.0, where 0.0 is the lowest priority +# and 1.0 is the highest priority +# +# Note that all URLs must be part of the site, and therefore must begin with +# the base_url (e.g., 'http://www.example.com/') as specified in config.xml. +# +# Any line beginning with a # is a comment. +# +# Example contents of the file: +# +# http://www.example.com/foo/bar +# http://www.example.com/foo/xxx.pdf lastmod=2003-12-31T14:05:06+00:00 +# http://www.example.com/foo/yyy?x=12&y=23 changefreq=weekly priority=0.3 diff --git a/collects/meta/build/sitemap/plt-pre.xml b/collects/meta/build/sitemap/plt-pre.xml new file mode 100644 index 0000000000..9305fc2174 --- /dev/null +++ b/collects/meta/build/sitemap/plt-pre.xml @@ -0,0 +1,16 @@ + + + + + + + + + + + + diff --git a/collects/meta/build/sitemap/setup.py b/collects/meta/build/sitemap/setup.py new file mode 100755 index 0000000000..fa703595d3 --- /dev/null +++ b/collects/meta/build/sitemap/setup.py @@ -0,0 +1,12 @@ +#!/usr/bin/env python + +from distutils.core import setup + +setup(name='sitemap_gen', + version='1.4', + description='Sitemap Generator', + license='BSD', + author='Google Inc.', + author_email='opensource@google.com', + url='http://sourceforge.net/projects/goog-sitemapgen/', + ) diff --git a/collects/meta/build/sitemap/sitemap_gen.py b/collects/meta/build/sitemap/sitemap_gen.py new file mode 100755 index 0000000000..cbcfd6f593 --- /dev/null +++ b/collects/meta/build/sitemap/sitemap_gen.py @@ -0,0 +1,2205 @@ +#!/usr/bin/env python +# +# Copyright (c) 2004, 2005 Google Inc. +# All rights reserved. 
+# +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions +# are met: +# +# * Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. +# +# * Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. +# +# * Neither the name of Google nor the names of its contributors may +# be used to endorse or promote products derived from this software +# without specific prior written permission. +# +# THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS +# FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE +# COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, +# INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, +# BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; +# LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +# CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT +# LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN +# ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. +# +# +# The sitemap_gen.py script is written in Python 2.2 and released to +# the open source community for continuous improvements under the BSD +# 2.0 new license, which can be found at: +# +# http://www.opensource.org/licenses/bsd-license.php +# + +__usage__ = \ +"""A simple script to automatically produce sitemaps for a webserver, +in the Google Sitemap Protocol (GSP). + +Usage: python sitemap_gen.py --config=config.xml [--help] [--testing] + --config=config.xml, specifies config file location + --help, displays usage message + --testing, specified when user is experimenting +""" + +# Please be careful that all syntax used in this file can be parsed on +# Python 1.5 -- this version check is not evaluated until after the +# entire file has been parsed. +import sys +if sys.hexversion < 0x02020000: + print 'This script requires Python 2.2 or later.' + print 'Currently run with version: %s' % sys.version + sys.exit(1) + +import fnmatch +import glob +import gzip +import md5 +import os +import re +import stat +import time +import types +import urllib +import urlparse +import xml.sax + +# True and False were introduced in Python2.2.2 +try: + testTrue=True + del testTrue +except NameError: + True=1 + False=0 + +# Text encodings +ENC_ASCII = 'ASCII' +ENC_UTF8 = 'UTF-8' +ENC_IDNA = 'IDNA' +ENC_ASCII_LIST = ['ASCII', 'US-ASCII', 'US', 'IBM367', 'CP367', 'ISO646-US' + 'ISO_646.IRV:1991', 'ISO-IR-6', 'ANSI_X3.4-1968', + 'ANSI_X3.4-1986', 'CPASCII' ] +ENC_DEFAULT_LIST = ['ISO-8859-1', 'ISO-8859-2', 'ISO-8859-5'] + +# Maximum number of urls in each sitemap, before next Sitemap is created +MAXURLS_PER_SITEMAP = 50000 + +# Suffix on a Sitemap index file +SITEINDEX_SUFFIX = '_index.xml' + +# Regular expressions tried for extracting URLs from access logs. 
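+# For example, a Common Log Format entry along the lines of
+#   1.2.3.4 - - [01/Jan/2005:00:00:00 +0000] "GET /index.html HTTP/1.0" 200 1234
+# is matched by the pattern below, which captures the request method ("GET")
+# and the request path ("/index.html") and accepts only entries whose status
+# field is 200.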
+ACCESSLOG_CLF_PATTERN = re.compile( + r'.+\s+"([^\s]+)\s+([^\s]+)\s+HTTP/\d+\.\d+"\s+200\s+.*' + ) + +# Match patterns for lastmod attributes +LASTMOD_PATTERNS = map(re.compile, [ + r'^\d\d\d\d$', + r'^\d\d\d\d-\d\d$', + r'^\d\d\d\d-\d\d-\d\d$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\dZ$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d[+-]\d\d:\d\d$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?Z$', + r'^\d\d\d\d-\d\d-\d\dT\d\d:\d\d:\d\d(\.\d+)?[+-]\d\d:\d\d$', + ]) + +# Match patterns for changefreq attributes +CHANGEFREQ_PATTERNS = [ + 'always', 'hourly', 'daily', 'weekly', 'monthly', 'yearly', 'never' + ] + +# XML formats +SITEINDEX_HEADER = \ + '\n' \ + '\n' +SITEINDEX_FOOTER = '\n' +SITEINDEX_ENTRY = \ + ' \n' \ + ' %(loc)s\n' \ + ' %(lastmod)s\n' \ + ' \n' +SITEMAP_HEADER = \ + '\n' \ + '\n' +SITEMAP_FOOTER = '\n' +SITEURL_XML_PREFIX = ' \n' +SITEURL_XML_SUFFIX = ' \n' + +# Search engines to notify with the updated sitemaps +# +# This list is very non-obvious in what's going on. Here's the gist: +# Each item in the list is a 6-tuple of items. The first 5 are "almost" +# the same as the input arguments to urlparse.urlunsplit(): +# 0 - schema +# 1 - netloc +# 2 - path +# 3 - query <-- EXCEPTION: specify a query map rather than a string +# 4 - fragment +# Additionally, add item 5: +# 5 - query attribute that should be set to the new Sitemap URL +# Clear as mud, I know. +NOTIFICATION_SITES = [ + ('http', 'www.google.com', 'webmasters/sitemaps/ping', {}, '', 'sitemap') + ] + + +class Error(Exception): + """ + Base exception class. In this module we tend not to use our own exception + types for very much, but they come in very handy on XML parsing with SAX. + """ + pass +#end class Error + + +class SchemaError(Error): + """Failure to process an XML file according to the schema we know.""" + pass +#end class SchemeError + + +class Encoder: + """ + Manages wide-character/narrow-character conversions for just about all + text that flows into or out of the script. + + You should always use this class for string coercion, as opposed to + letting Python handle coercions automatically. Reason: Python + usually assumes ASCII (7-bit) as a default narrow character encoding, + which is not the kind of data we generally deal with. + + General high-level methodologies used in sitemap_gen: + + [PATHS] + File system paths may be wide or narrow, depending on platform. + This works fine, just be aware of it and be very careful to not + mix them. That is, if you have to pass several file path arguments + into a library call, make sure they are all narrow or all wide. + This class has MaybeNarrowPath() which should be called on every + file system path you deal with. + + [URLS] + URL locations are stored in Narrow form, already escaped. This has the + benefit of keeping escaping and encoding as close as possible to the format + we read them in. The downside is we may end up with URLs that have + intermingled encodings -- the root path may be encoded in one way + while the filename is encoded in another. This is obviously wrong, but + it should hopefully be an issue hit by very few users. The workaround + from the user level (assuming they notice) is to specify a default_encoding + parameter in their config file. + + [OTHER] + Other text, such as attributes of the URL class, configuration options, + etc, are generally stored in Unicode for simplicity. 
+ """ + + def __init__(self): + self._user = None # User-specified default encoding + self._learned = [] # Learned default encodings + self._widefiles = False # File system can be wide + + # Can the file system be Unicode? + try: + self._widefiles = os.path.supports_unicode_filenames + except AttributeError: + try: + self._widefiles = sys.getwindowsversion() == os.VER_PLATFORM_WIN32_NT + except AttributeError: + pass + + # Try to guess a working default + try: + encoding = sys.getfilesystemencoding() + if encoding and not (encoding.upper() in ENC_ASCII_LIST): + self._learned = [ encoding ] + except AttributeError: + pass + + if not self._learned: + encoding = sys.getdefaultencoding() + if encoding and not (encoding.upper() in ENC_ASCII_LIST): + self._learned = [ encoding ] + + # If we had no guesses, start with some European defaults + if not self._learned: + self._learned = ENC_DEFAULT_LIST + #end def __init__ + + def SetUserEncoding(self, encoding): + self._user = encoding + #end def SetUserEncoding + + def NarrowText(self, text, encoding): + """ Narrow a piece of arbitrary text """ + if type(text) != types.UnicodeType: + return text + + # Try the passed in preference + if encoding: + try: + result = text.encode(encoding) + if not encoding in self._learned: + self._learned.append(encoding) + return result + except UnicodeError: + pass + except LookupError: + output.Warn('Unknown encoding: %s' % encoding) + + # Try the user preference + if self._user: + try: + return text.encode(self._user) + except UnicodeError: + pass + except LookupError: + temp = self._user + self._user = None + output.Warn('Unknown default_encoding: %s' % temp) + + # Look through learned defaults, knock any failing ones out of the list + while self._learned: + try: + return text.encode(self._learned[0]) + except: + del self._learned[0] + + # When all other defaults are exhausted, use UTF-8 + try: + return text.encode(ENC_UTF8) + except UnicodeError: + pass + + # Something is seriously wrong if we get to here + return text.encode(ENC_ASCII, 'ignore') + #end def NarrowText + + def MaybeNarrowPath(self, text): + """ Paths may be allowed to stay wide """ + if self._widefiles: + return text + return self.NarrowText(text, None) + #end def MaybeNarrowPath + + def WidenText(self, text, encoding): + """ Widen a piece of arbitrary text """ + if type(text) != types.StringType: + return text + + # Try the passed in preference + if encoding: + try: + result = unicode(text, encoding) + if not encoding in self._learned: + self._learned.append(encoding) + return result + except UnicodeError: + pass + except LookupError: + output.Warn('Unknown encoding: %s' % encoding) + + # Try the user preference + if self._user: + try: + return unicode(text, self._user) + except UnicodeError: + pass + except LookupError: + temp = self._user + self._user = None + output.Warn('Unknown default_encoding: %s' % temp) + + # Look through learned defaults, knock any failing ones out of the list + while self._learned: + try: + return unicode(text, self._learned[0]) + except: + del self._learned[0] + + # When all other defaults are exhausted, use UTF-8 + try: + return unicode(text, ENC_UTF8) + except UnicodeError: + pass + + # Getting here means it wasn't UTF-8 and we had no working default. + # We really don't have anything "right" we can do anymore. 
+ output.Warn('Unrecognized encoding in text: %s' % text) + if not self._user: + output.Warn('You may need to set a default_encoding in your ' + 'configuration file.') + return text.decode(ENC_ASCII, 'ignore') + #end def WidenText +#end class Encoder +encoder = Encoder() + + +class Output: + """ + Exposes logging functionality, and tracks how many errors + we have thus output. + + Logging levels should be used as thus: + Fatal -- extremely sparingly + Error -- config errors, entire blocks of user 'intention' lost + Warn -- individual URLs lost + Log(,0) -- Un-suppressable text that's not an error + Log(,1) -- touched files, major actions + Log(,2) -- parsing notes, filtered or duplicated URLs + Log(,3) -- each accepted URL + """ + + def __init__(self): + self.num_errors = 0 # Count of errors + self.num_warns = 0 # Count of warnings + + self._errors_shown = {} # Shown errors + self._warns_shown = {} # Shown warnings + self._verbose = 0 # Level of verbosity + #end def __init__ + + def Log(self, text, level): + """ Output a blurb of diagnostic text, if the verbose level allows it """ + if text: + text = encoder.NarrowText(text, None) + if self._verbose >= level: + print text + #end def Log + + def Warn(self, text): + """ Output and count a warning. Suppress duplicate warnings. """ + if text: + text = encoder.NarrowText(text, None) + hash = md5.new(text).digest() + if not self._warns_shown.has_key(hash): + self._warns_shown[hash] = 1 + print '[WARNING] ' + text + else: + self.Log('(suppressed) [WARNING] ' + text, 3) + self.num_warns = self.num_warns + 1 + #end def Warn + + def Error(self, text): + """ Output and count an error. Suppress duplicate errors. """ + if text: + text = encoder.NarrowText(text, None) + hash = md5.new(text).digest() + if not self._errors_shown.has_key(hash): + self._errors_shown[hash] = 1 + print '[ERROR] ' + text + else: + self.Log('(suppressed) [ERROR] ' + text, 3) + self.num_errors = self.num_errors + 1 + #end def Error + + def Fatal(self, text): + """ Output an error and terminate the program. """ + if text: + text = encoder.NarrowText(text, None) + print '[FATAL] ' + text + else: + print 'Fatal error.' + sys.exit(1) + #end def Fatal + + def SetVerbose(self, level): + """ Sets the verbose level. """ + try: + if type(level) != types.IntType: + level = int(level) + if (level >= 0) and (level <= 3): + self._verbose = level + return + except ValueError: + pass + self.Error('Verbose level (%s) must be between 0 and 3 inclusive.' % level) + #end def SetVerbose +#end class Output +output = Output() + + +class URL(object): + """ URL is a smart structure grouping together the properties we + care about for a single web reference. """ + __slots__ = 'loc', 'lastmod', 'changefreq', 'priority' + + def __init__(self): + self.loc = None # URL -- in Narrow characters + self.lastmod = None # ISO8601 timestamp of last modify + self.changefreq = None # Text term for update frequency + self.priority = None # Float between 0 and 1 (inc) + #end def __init__ + + def __cmp__(self, other): + if self.loc < other.loc: + return -1 + if self.loc > other.loc: + return 1 + return 0 + #end def __cmp__ + + def TrySetAttribute(self, attribute, value): + """ Attempt to set the attribute to the value, with a pretty try + block around it. 
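+
+    Illustrative behaviour (hypothetical values, not taken from the
+    original documentation): setting 'loc' runs the value through
+    Canonicalize first, so
+
+      url.TrySetAttribute('loc', 'http://example.com/a b')
+
+    stores 'http://example.com/a%20b', while an attribute name outside
+    __slots__ only triggers a warning.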
""" + if attribute == 'loc': + self.loc = self.Canonicalize(value) + else: + try: + setattr(self, attribute, value) + except AttributeError: + output.Warn('Unknown URL attribute: %s' % attribute) + #end def TrySetAttribute + + def IsAbsolute(loc): + """ Decide if the URL is absolute or not """ + if not loc: + return False + narrow = encoder.NarrowText(loc, None) + (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow) + if (not scheme) or (not netloc): + return False + return True + #end def IsAbsolute + IsAbsolute = staticmethod(IsAbsolute) + + def Canonicalize(loc): + """ Do encoding and canonicalization on a URL string """ + if not loc: + return loc + + # Let the encoder try to narrow it + narrow = encoder.NarrowText(loc, None) + + # Escape components individually + (scheme, netloc, path, query, frag) = urlparse.urlsplit(narrow) + unr = '-._~' + sub = '!$&\'()*+,;=' + netloc = urllib.quote(netloc, unr + sub + '%:@/[]') + path = urllib.quote(path, unr + sub + '%:@/') + query = urllib.quote(query, unr + sub + '%:@/?') + frag = urllib.quote(frag, unr + sub + '%:@/?') + + # Try built-in IDNA encoding on the netloc + try: + (ignore, widenetloc, ignore, ignore, ignore) = urlparse.urlsplit(loc) + for c in widenetloc: + if c >= unichr(128): + netloc = widenetloc.encode(ENC_IDNA) + netloc = urllib.quote(netloc, unr + sub + '%:@/[]') + break + except UnicodeError: + # urlsplit must have failed, based on implementation differences in the + # library. There is not much we can do here, except ignore it. + pass + except LookupError: + output.Warn('An International Domain Name (IDN) is being used, but this ' + 'version of Python does not have support for IDNA encoding. ' + ' (IDNA support was introduced in Python 2.3) The encoding ' + 'we have used instead is wrong and will probably not yield ' + 'valid URLs.') + bad_netloc = False + if '%' in netloc: + bad_netloc = True + + # Put it all back together + narrow = urlparse.urlunsplit((scheme, netloc, path, query, frag)) + + # I let '%' through. Fix any that aren't pre-existing escapes. + HEXDIG = '0123456789abcdefABCDEF' + list = narrow.split('%') + narrow = list[0] + del list[0] + for item in list: + if (len(item) >= 2) and (item[0] in HEXDIG) and (item[1] in HEXDIG): + narrow = narrow + '%' + item + else: + narrow = narrow + '%25' + item + + # Issue a warning if this is a bad URL + if bad_netloc: + output.Warn('Invalid characters in the host or domain portion of a URL: ' + + narrow) + + return narrow + #end def Canonicalize + Canonicalize = staticmethod(Canonicalize) + + def Validate(self, base_url, allow_fragment): + """ Verify the data in this URL is well-formed, and override if not. 
""" + assert type(base_url) == types.StringType + + # Test (and normalize) the ref + if not self.loc: + output.Warn('Empty URL') + return False + if allow_fragment: + self.loc = urlparse.urljoin(base_url, self.loc) + if not self.loc.startswith(base_url): + output.Warn('Discarded URL for not starting with the base_url: %s' % + self.loc) + self.loc = None + return False + + # Test the lastmod + if self.lastmod: + match = False + self.lastmod = self.lastmod.upper() + for pattern in LASTMOD_PATTERNS: + match = pattern.match(self.lastmod) + if match: + break + if not match: + output.Warn('Lastmod "%s" does not appear to be in ISO8601 format on ' + 'URL: %s' % (self.lastmod, self.loc)) + self.lastmod = None + + # Test the changefreq + if self.changefreq: + match = False + self.changefreq = self.changefreq.lower() + for pattern in CHANGEFREQ_PATTERNS: + if self.changefreq == pattern: + match = True + break + if not match: + output.Warn('Changefreq "%s" is not a valid change frequency on URL ' + ': %s' % (self.changefreq, self.loc)) + self.changefreq = None + + # Test the priority + if self.priority: + priority = -1.0 + try: + priority = float(self.priority) + except ValueError: + pass + if (priority < 0.0) or (priority > 1.0): + output.Warn('Priority "%s" is not a number between 0 and 1 inclusive ' + 'on URL: %s' % (self.priority, self.loc)) + self.priority = None + + return True + #end def Validate + + def MakeHash(self): + """ Provides a uniform way of hashing URLs """ + if not self.loc: + return None + if self.loc.endswith('/'): + return md5.new(self.loc[:-1]).digest() + return md5.new(self.loc).digest() + #end def MakeHash + + def Log(self, prefix='URL', level=3): + """ Dump the contents, empty or not, to the log. """ + out = prefix + ':' + + for attribute in self.__slots__: + value = getattr(self, attribute) + if not value: + value = '' + out = out + (' %s=[%s]' % (attribute, value)) + + output.Log('%s' % encoder.NarrowText(out, None), level) + #end def Log + + def WriteXML(self, file): + """ Dump non-empty contents to the output file, in XML format. """ + if not self.loc: + return + out = SITEURL_XML_PREFIX + + for attribute in self.__slots__: + value = getattr(self, attribute) + if value: + if type(value) == types.UnicodeType: + value = encoder.NarrowText(value, None) + elif type(value) != types.StringType: + value = str(value) + value = xml.sax.saxutils.escape(value) + out = out + (' <%s>%s\n' % (attribute, value, attribute)) + + out = out + SITEURL_XML_SUFFIX + file.write(out) + #end def WriteXML +#end class URL + + +class Filter: + """ + A filter on the stream of URLs we find. A filter is, in essence, + a wildcard applied to the stream. You can think of this as an + operator that returns a tri-state when given a URL: + + True -- this URL is to be included in the sitemap + None -- this URL is undecided + False -- this URL is to be dropped from the sitemap + """ + + def __init__(self, attributes): + self._wildcard = None # Pattern for wildcard match + self._regexp = None # Pattern for regexp match + self._pass = False # "Drop" filter vs. 
"Pass" filter + + if not ValidateAttributes('FILTER', attributes, + ('pattern', 'type', 'action')): + return + + # Check error count on the way in + num_errors = output.num_errors + + # Fetch the attributes + pattern = attributes.get('pattern') + type = attributes.get('type', 'wildcard') + action = attributes.get('action', 'drop') + if type: + type = type.lower() + if action: + action = action.lower() + + # Verify the attributes + if not pattern: + output.Error('On a filter you must specify a "pattern" to match') + elif (not type) or ((type != 'wildcard') and (type != 'regexp')): + output.Error('On a filter you must specify either \'type="wildcard"\' ' + 'or \'type="regexp"\'') + elif (action != 'pass') and (action != 'drop'): + output.Error('If you specify a filter action, it must be either ' + '\'action="pass"\' or \'action="drop"\'') + + # Set the rule + if action == 'drop': + self._pass = False + elif action == 'pass': + self._pass = True + + if type == 'wildcard': + self._wildcard = pattern + elif type == 'regexp': + try: + self._regexp = re.compile(pattern) + except re.error: + output.Error('Bad regular expression: %s' % pattern) + + # Log the final results iff we didn't add any errors + if num_errors == output.num_errors: + output.Log('Filter: %s any URL that matches %s "%s"' % + (action, type, pattern), 2) + #end def __init__ + + def Apply(self, url): + """ Process the URL, as above. """ + if (not url) or (not url.loc): + return None + + if self._wildcard: + if fnmatch.fnmatchcase(url.loc, self._wildcard): + return self._pass + return None + + if self._regexp: + if self._regexp.search(url.loc): + return self._pass + return None + + assert False # unreachable + #end def Apply +#end class Filter + + +class InputURL: + """ + Each Input class knows how to yield a set of URLs from a data source. + + This one handles a single URL, manually specified in the config file. + """ + + def __init__(self, attributes): + self._url = None # The lonely URL + + if not ValidateAttributes('URL', attributes, + ('href', 'lastmod', 'changefreq', 'priority')): + return + + url = URL() + for attr in attributes.keys(): + if attr == 'href': + url.TrySetAttribute('loc', attributes[attr]) + else: + url.TrySetAttribute(attr, attributes[attr]) + + if not url.loc: + output.Error('Url entries must have an href attribute.') + return + + self._url = url + output.Log('Input: From URL "%s"' % self._url.loc, 2) + #end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + if self._url: + consumer(self._url, True) + #end def ProduceURLs +#end class InputURL + + +class InputURLList: + """ + Each Input class knows how to yield a set of URLs from a data source. + + This one handles a text file with a list of URLs + """ + + def __init__(self, attributes): + self._path = None # The file path + self._encoding = None # Encoding of that file + + if not ValidateAttributes('URLLIST', attributes, ('path', 'encoding')): + return + + self._path = attributes.get('path') + self._encoding = attributes.get('encoding', ENC_UTF8) + if self._path: + self._path = encoder.MaybeNarrowPath(self._path) + if os.path.isfile(self._path): + output.Log('Input: From URLLIST "%s"' % self._path, 2) + else: + output.Error('Can not locate file: %s' % self._path) + self._path = None + else: + output.Error('Urllist entries must have a "path" attribute.') + #end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. 
""" + + # Open the file + (frame, file) = OpenFileForRead(self._path, 'URLLIST') + if not file: + return + + # Iterate lines + linenum = 0 + for line in file.readlines(): + linenum = linenum + 1 + + # Strip comments and empty lines + if self._encoding: + line = encoder.WidenText(line, self._encoding) + line = line.strip() + if (not line) or line[0] == '#': + continue + + # Split the line on space + url = URL() + cols = line.split(' ') + for i in range(0,len(cols)): + cols[i] = cols[i].strip() + url.TrySetAttribute('loc', cols[0]) + + # Extract attributes from the other columns + for i in range(1,len(cols)): + if cols[i]: + try: + (attr_name, attr_val) = cols[i].split('=', 1) + url.TrySetAttribute(attr_name, attr_val) + except ValueError: + output.Warn('Line %d: Unable to parse attribute: %s' % + (linenum, cols[i])) + + # Pass it on + consumer(url, False) + + file.close() + if frame: + frame.close() + #end def ProduceURLs +#end class InputURLList + + +class InputDirectory: + """ + Each Input class knows how to yield a set of URLs from a data source. + + This one handles a directory that acts as base for walking the filesystem. + """ + + def __init__(self, attributes, base_url): + self._path = None # The directory + self._url = None # The URL equivelant + self._default_file = None + + if not ValidateAttributes('DIRECTORY', attributes, ('path', 'url', + 'default_file')): + return + + # Prep the path -- it MUST end in a sep + path = attributes.get('path') + if not path: + output.Error('Directory entries must have both "path" and "url" ' + 'attributes') + return + path = encoder.MaybeNarrowPath(path) + if not path.endswith(os.sep): + path = path + os.sep + if not os.path.isdir(path): + output.Error('Can not locate directory: %s' % path) + return + + # Prep the URL -- it MUST end in a sep + url = attributes.get('url') + if not url: + output.Error('Directory entries must have both "path" and "url" ' + 'attributes') + return + url = URL.Canonicalize(url) + if not url.endswith('/'): + url = url + '/' + if not url.startswith(base_url): + url = urlparse.urljoin(base_url, url) + if not url.startswith(base_url): + output.Error('The directory URL "%s" is not relative to the ' + 'base_url: %s' % (url, base_url)) + return + + # Prep the default file -- it MUST be just a filename + file = attributes.get('default_file') + if file: + file = encoder.MaybeNarrowPath(file) + if os.sep in file: + output.Error('The default_file "%s" can not include path information.' + % file) + file = None + + self._path = path + self._url = url + self._default_file = file + if file: + output.Log('Input: From DIRECTORY "%s" (%s) with default file "%s"' + % (path, url, file), 2) + else: + output.Log('Input: From DIRECTORY "%s" (%s) with no default file' + % (path, url), 2) + #end def __init__ + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. """ + if not self._path: + return + + root_path = self._path + root_URL = self._url + root_file = self._default_file + + def PerFile(dirpath, name): + """ + Called once per file. 
+ Note that 'name' will occasionally be None -- for a directory itself + """ + # Pull a timestamp + url = URL() + isdir = False + try: + if name: + path = os.path.join(dirpath, name) + else: + path = dirpath + isdir = os.path.isdir(path) + time = None + if isdir and root_file: + file = os.path.join(path, root_file) + try: + time = os.stat(file)[stat.ST_MTIME]; + except OSError: + pass + if not time: + time = os.stat(path)[stat.ST_MTIME]; + url.lastmod = TimestampISO8601(time) + except OSError: + pass + except ValueError: + pass + + # Build a URL + middle = dirpath[len(root_path):] + if os.sep != '/': + middle = middle.replace(os.sep, '/') + if middle: + middle = middle + '/' + if name: + middle = middle + name + if isdir: + middle = middle + '/' + url.TrySetAttribute('loc', root_URL + encoder.WidenText(middle, None)) + + # Suppress default files. (All the way down here so we can log it.) + if name and (root_file == name): + url.Log(prefix='IGNORED (default file)', level=2) + return + + consumer(url, False) + #end def PerFile + + def PerDirectory(ignore, dirpath, namelist): + """ + Called once per directory with a list of all the contained files/dirs. + """ + ignore = ignore # Avoid warnings of an unused parameter + + if not dirpath.startswith(root_path): + output.Warn('Unable to decide what the root path is for directory: ' + '%s' % dirpath) + return + + for name in namelist: + PerFile(dirpath, name) + #end def PerDirectory + + output.Log('Walking DIRECTORY "%s"' % self._path, 1) + PerFile(self._path, None) + os.path.walk(self._path, PerDirectory, None) + #end def ProduceURLs +#end class InputDirectory + + +class InputAccessLog: + """ + Each Input class knows how to yield a set of URLs from a data source. + + This one handles access logs. It's non-trivial in that we want to + auto-detect log files in the Common Logfile Format (as used by Apache, + for instance) and the Extended Log File Format (as used by IIS, for + instance). + """ + + def __init__(self, attributes): + self._path = None # The file path + self._encoding = None # Encoding of that file + self._is_elf = False # Extended Log File Format? + self._is_clf = False # Common Logfile Format? 
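+    # Illustrative samples of the two formats this class auto-detects
+    # (hypothetical log content, shown here only as a reminder):
+    #   ELF header:  #Fields: date time cs-method cs-uri-stem cs-uri-query sc-status
+    #   CLF line:    127.0.0.1 - - [10/Oct/2008:13:55:36 -0700] "GET /index.html HTTP/1.0" 200 2326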
+ self._elf_status = -1 # ELF field: '200' + self._elf_method = -1 # ELF field: 'HEAD' + self._elf_uri = -1 # ELF field: '/foo?bar=1' + self._elf_urifrag1 = -1 # ELF field: '/foo' + self._elf_urifrag2 = -1 # ELF field: 'bar=1' + + if not ValidateAttributes('ACCESSLOG', attributes, ('path', 'encoding')): + return + + self._path = attributes.get('path') + self._encoding = attributes.get('encoding', ENC_UTF8) + if self._path: + self._path = encoder.MaybeNarrowPath(self._path) + if os.path.isfile(self._path): + output.Log('Input: From ACCESSLOG "%s"' % self._path, 2) + else: + output.Error('Can not locate file: %s' % self._path) + self._path = None + else: + output.Error('Accesslog entries must have a "path" attribute.') + #end def __init__ + + def RecognizeELFLine(self, line): + """ Recognize the Fields directive that heads an ELF file """ + if not line.startswith('#Fields:'): + return False + fields = line.split(' ') + del fields[0] + for i in range(0, len(fields)): + field = fields[i].strip() + if field == 'sc-status': + self._elf_status = i + elif field == 'cs-method': + self._elf_method = i + elif field == 'cs-uri': + self._elf_uri = i + elif field == 'cs-uri-stem': + self._elf_urifrag1 = i + elif field == 'cs-uri-query': + self._elf_urifrag2 = i + output.Log('Recognized an Extended Log File Format file.', 2) + return True + #end def RecognizeELFLine + + def GetELFLine(self, line): + """ Fetch the requested URL from an ELF line """ + fields = line.split(' ') + count = len(fields) + + # Verify status was Ok + if self._elf_status >= 0: + if self._elf_status >= count: + return None + if not fields[self._elf_status].strip() == '200': + return None + + # Verify method was HEAD or GET + if self._elf_method >= 0: + if self._elf_method >= count: + return None + if not fields[self._elf_method].strip() in ('HEAD', 'GET'): + return None + + # Pull the full URL if we can + if self._elf_uri >= 0: + if self._elf_uri >= count: + return None + url = fields[self._elf_uri].strip() + if url != '-': + return url + + # Put together a fragmentary URL + if self._elf_urifrag1 >= 0: + if self._elf_urifrag1 >= count or self._elf_urifrag2 >= count: + return None + urlfrag1 = fields[self._elf_urifrag1].strip() + urlfrag2 = None + if self._elf_urifrag2 >= 0: + urlfrag2 = fields[self._elf_urifrag2] + if urlfrag1 and (urlfrag1 != '-'): + if urlfrag2 and (urlfrag2 != '-'): + urlfrag1 = urlfrag1 + '?' + urlfrag2 + return urlfrag1 + + return None + #end def GetELFLine + + def RecognizeCLFLine(self, line): + """ Try to tokenize a logfile line according to CLF pattern and see if + it works. """ + match = ACCESSLOG_CLF_PATTERN.match(line) + recognize = match and (match.group(1) in ('HEAD', 'GET')) + if recognize: + output.Log('Recognized a Common Logfile Format file.', 2) + return recognize + #end def RecognizeCLFLine + + def GetCLFLine(self, line): + """ Fetch the requested URL from a CLF line """ + match = ACCESSLOG_CLF_PATTERN.match(line) + if match: + request = match.group(1) + if request in ('HEAD', 'GET'): + return match.group(2) + return None + #end def GetCLFLine + + def ProduceURLs(self, consumer): + """ Produces URLs from our data source, hands them in to the consumer. 
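+
+    The log format is sniffed lazily: both recognizers are retried on
+    each line until one of them fires, after which only that format's
+    parser is used for the rest of the file.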
""" + + # Open the file + (frame, file) = OpenFileForRead(self._path, 'ACCESSLOG') + if not file: + return + + # Iterate lines + for line in file.readlines(): + if self._encoding: + line = encoder.WidenText(line, self._encoding) + line = line.strip() + + # If we don't know the format yet, try them both + if (not self._is_clf) and (not self._is_elf): + self._is_elf = self.RecognizeELFLine(line) + self._is_clf = self.RecognizeCLFLine(line) + + # Digest the line + match = None + if self._is_elf: + match = self.GetELFLine(line) + elif self._is_clf: + match = self.GetCLFLine(line) + if not match: + continue + + # Pass it on + url = URL() + url.TrySetAttribute('loc', match) + consumer(url, True) + + file.close() + if frame: + frame.close() + #end def ProduceURLs +#end class InputAccessLog + + +class InputSitemap(xml.sax.handler.ContentHandler): + + """ + Each Input class knows how to yield a set of URLs from a data source. + + This one handles Sitemap files and Sitemap index files. For the sake + of simplicity in design (and simplicity in interfacing with the SAX + package), we do not handle these at the same time, recursively. Instead + we read an index file completely and make a list of Sitemap files, then + go back and process each Sitemap. + """ + + class _ContextBase(object): + + """Base class for context handlers in our SAX processing. A context + handler is a class that is responsible for understanding one level of + depth in the XML schema. The class knows what sub-tags are allowed, + and doing any processing specific for the tag we're in. + + This base class is the API filled in by specific context handlers, + all defined below. + """ + + def __init__(self, subtags): + """Initialize with a sequence of the sub-tags that would be valid in + this context.""" + self._allowed_tags = subtags # Sequence of sub-tags we can have + self._last_tag = None # Most recent seen sub-tag + #end def __init__ + + def AcceptTag(self, tag): + """Returns True iff opening a sub-tag is valid in this context.""" + valid = tag in self._allowed_tags + if valid: + self._last_tag = tag + else: + self._last_tag = None + return valid + #end def AcceptTag + + def AcceptText(self, text): + """Returns True iff a blurb of text is valid in this context.""" + return False + #end def AcceptText + + def Open(self): + """The context is opening. Do initialization.""" + pass + #end def Open + + def Close(self): + """The context is closing. Return our result, if any.""" + pass + #end def Close + + def Return(self, result): + """We're returning to this context after handling a sub-tag. This + method is called with the result data from the sub-tag that just + closed. 
Here in _ContextBase, if we ever see a result it means + the derived child class forgot to override this method.""" + if result: + raise NotImplementedError + #end def Return + #end class _ContextBase + + class _ContextUrlSet(_ContextBase): + + """Context handler for the document node in a Sitemap.""" + + def __init__(self): + InputSitemap._ContextBase.__init__(self, ('url',)) + #end def __init__ + #end class _ContextUrlSet + + class _ContextUrl(_ContextBase): + + """Context handler for a URL node in a Sitemap.""" + + def __init__(self, consumer): + """Initialize this context handler with the callable consumer that + wants our URLs.""" + InputSitemap._ContextBase.__init__(self, URL.__slots__) + self._url = None # The URL object we're building + self._consumer = consumer # Who wants to consume it + #end def __init__ + + def Open(self): + """Initialize the URL.""" + assert not self._url + self._url = URL() + #end def Open + + def Close(self): + """Pass the URL to the consumer and reset it to None.""" + assert self._url + self._consumer(self._url, False) + self._url = None + #end def Close + + def Return(self, result): + """A value context has closed, absorb the data it gave us.""" + assert self._url + if result: + self._url.TrySetAttribute(self._last_tag, result) + #end def Return + #end class _ContextUrl + + class _ContextSitemapIndex(_ContextBase): + + """Context handler for the document node in an index file.""" + + def __init__(self): + InputSitemap._ContextBase.__init__(self, ('sitemap',)) + self._loclist = [] # List of accumulated Sitemap URLs + #end def __init__ + + def Open(self): + """Just a quick verify of state.""" + assert not self._loclist + #end def Open + + def Close(self): + """Return our list of accumulated URLs.""" + if self._loclist: + temp = self._loclist + self._loclist = [] + return temp + #end def Close + + def Return(self, result): + """Getting a new loc URL, add it to the collection.""" + if result: + self._loclist.append(result) + #end def Return + #end class _ContextSitemapIndex + + class _ContextSitemap(_ContextBase): + + """Context handler for a Sitemap entry in an index file.""" + + def __init__(self): + InputSitemap._ContextBase.__init__(self, ('loc', 'lastmod')) + self._loc = None # The URL to the Sitemap + #end def __init__ + + def Open(self): + """Just a quick verify of state.""" + assert not self._loc + #end def Open + + def Close(self): + """Return our URL to our parent.""" + if self._loc: + temp = self._loc + self._loc = None + return temp + output.Warn('In the Sitemap index file, a "sitemap" entry had no "loc".') + #end def Close + + def Return(self, result): + """A value has closed. If it was a 'loc', absorb it.""" + if result and (self._last_tag == 'loc'): + self._loc = result + #end def Return + #end class _ContextSitemap + + class _ContextValue(_ContextBase): + + """Context handler for a single value. We return just the value. 
The + higher level context has to remember what tag led into us.""" + + def __init__(self): + InputSitemap._ContextBase.__init__(self, ()) + self._text = None + #end def __init__ + + def AcceptText(self, text): + """Allow all text, adding it to our buffer.""" + if self._text: + self._text = self._text + text + else: + self._text = text + return True + #end def AcceptText + + def Open(self): + """Initialize our buffer.""" + self._text = None + #end def Open + + def Close(self): + """Return what's in our buffer.""" + text = self._text + self._text = None + if text: + text = text.strip() + return text + #end def Close + #end class _ContextValue + + def __init__(self, attributes): + """Initialize with a dictionary of attributes from our entry in the + config file.""" + xml.sax.handler.ContentHandler.__init__(self) + self._pathlist = None # A list of files + self._current = -1 # Current context in _contexts + self._contexts = None # The stack of contexts we allow + self._contexts_idx = None # ...contexts for index files + self._contexts_stm = None # ...contexts for Sitemap files + + if not ValidateAttributes('SITEMAP', attributes, ['path']): + return + + # Init the first file path + path = attributes.get('path') + if path: + path = encoder.MaybeNarrowPath(path) + if os.path.isfile(path): + output.Log('Input: From SITEMAP "%s"' % path, 2) + self._pathlist = [path] + else: + output.Error('Can not locate file "%s"' % path) + else: + output.Error('Sitemap entries must have a "path" attribute.') + #end def __init__ + + def ProduceURLs(self, consumer): + """In general: Produces URLs from our data source, hand them to the + callable consumer. + + In specific: Iterate over our list of paths and delegate the actual + processing to helper methods. This is a complexity no other data source + needs to suffer. We are unique in that we can have files that tell us + to bring in other files. + + Note the decision to allow an index file or not is made in this method. + If we call our parser with (self._contexts == None) the parser will + grab whichever context stack can handle the file. IE: index is allowed. + If instead we set (self._contexts = ...) before parsing, the parser + will only use the stack we specify. IE: index not allowed. + """ + # Set up two stacks of contexts + self._contexts_idx = [InputSitemap._ContextSitemapIndex(), + InputSitemap._ContextSitemap(), + InputSitemap._ContextValue()] + + self._contexts_stm = [InputSitemap._ContextUrlSet(), + InputSitemap._ContextUrl(consumer), + InputSitemap._ContextValue()] + + # Process the first file + assert self._pathlist + path = self._pathlist[0] + self._contexts = None # We allow an index file here + self._ProcessFile(path) + + # Iterate over remaining files + self._contexts = self._contexts_stm # No index files allowed + for path in self._pathlist[1:]: + self._ProcessFile(path) + #end def ProduceURLs + + def _ProcessFile(self, path): + """Do per-file reading/parsing/consuming for the file path passed in.""" + assert path + + # Open our file + (frame, file) = OpenFileForRead(path, 'SITEMAP') + if not file: + return + + # Rev up the SAX engine + try: + self._current = -1 + xml.sax.parse(file, self) + except SchemaError: + output.Error('An error in file "%s" made us abort reading the Sitemap.' 
+ % path) + except IOError: + output.Error('Cannot read from file "%s"' % path) + except xml.sax._exceptions.SAXParseException, e: + output.Error('XML error in the file "%s" (line %d, column %d): %s' % + (path, e._linenum, e._colnum, e.getMessage())) + + # Clean up + file.close() + if frame: + frame.close() + #end def _ProcessFile + + def _MungeLocationListIntoFiles(self, urllist): + """Given a list of URLs, munge them into our self._pathlist property. + We do this by assuming all the files live in the same directory as + the first file in the existing pathlist. That is, we assume a + Sitemap index points to Sitemaps only in the same directory. This + is not true in general, but will be true for any output produced + by this script. + """ + assert self._pathlist + path = self._pathlist[0] + path = os.path.normpath(path) + dir = os.path.dirname(path) + wide = False + if type(path) == types.UnicodeType: + wide = True + + for url in urllist: + url = URL.Canonicalize(url) + output.Log('Index points to Sitemap file at: %s' % url, 2) + (scheme, netloc, path, query, frag) = urlparse.urlsplit(url) + file = os.path.basename(path) + file = urllib.unquote(file) + if wide: + file = encoder.WidenText(file) + if dir: + file = dir + os.sep + file + if file: + self._pathlist.append(file) + output.Log('Will attempt to read Sitemap file: %s' % file, 1) + #end def _MungeLocationListIntoFiles + + def startElement(self, tag, attributes): + """SAX processing, called per node in the config stream. + As long as the new tag is legal in our current context, this + becomes an Open call on one context deeper. + """ + # If this is the document node, we may have to look for a context stack + if (self._current < 0) and not self._contexts: + assert self._contexts_idx and self._contexts_stm + if tag == 'urlset': + self._contexts = self._contexts_stm + elif tag == 'sitemapindex': + self._contexts = self._contexts_idx + output.Log('File is a Sitemap index.', 2) + else: + output.Error('The document appears to be neither a Sitemap nor a ' + 'Sitemap index.') + raise SchemaError + + # Display a kinder error on a common mistake + if (self._current < 0) and (self._contexts == self._contexts_stm) and ( + tag == 'sitemapindex'): + output.Error('A Sitemap index can not refer to another Sitemap index.') + raise SchemaError + + # Verify no unexpected attributes + if attributes: + text = '' + for attr in attributes.keys(): + # The document node will probably have namespaces + if self._current < 0: + if attr.find('xmlns') >= 0: + continue + if attr.find('xsi') >= 0: + continue + if text: + text = text + ', ' + text = text + attr + if text: + output.Warn('Did not expect any attributes on any tag, instead tag ' + '"%s" had attributes: %s' % (tag, text)) + + # Switch contexts + if (self._current < 0) or (self._contexts[self._current].AcceptTag(tag)): + self._current = self._current + 1 + assert self._current < len(self._contexts) + self._contexts[self._current].Open() + else: + output.Error('Can not accept tag "%s" where it appears.' % tag) + raise SchemaError + #end def startElement + + def endElement(self, tag): + """SAX processing, called per node in the config stream. + This becomes a call to Close on one context followed by a call + to Return on the previous. 
+ """ + tag = tag # Avoid warning on unused argument + assert self._current >= 0 + retval = self._contexts[self._current].Close() + self._current = self._current - 1 + if self._current >= 0: + self._contexts[self._current].Return(retval) + elif retval and (self._contexts == self._contexts_idx): + self._MungeLocationListIntoFiles(retval) + #end def endElement + + def characters(self, text): + """SAX processing, called when text values are read. Important to + note that one single text value may be split across multiple calls + of this method. + """ + if (self._current < 0) or ( + not self._contexts[self._current].AcceptText(text)): + if text.strip(): + output.Error('Can not accept text "%s" where it appears.' % text) + raise SchemaError + #end def characters +#end class InputSitemap + + +class FilePathGenerator: + """ + This class generates filenames in a series, upon request. + You can request any iteration number at any time, you don't + have to go in order. + + Example of iterations for '/path/foo.xml.gz': + 0 --> /path/foo.xml.gz + 1 --> /path/foo1.xml.gz + 2 --> /path/foo2.xml.gz + _index.xml --> /path/foo_index.xml + """ + + def __init__(self): + self.is_gzip = False # Is this a GZIP file? + + self._path = None # '/path/' + self._prefix = None # 'foo' + self._suffix = None # '.xml.gz' + #end def __init__ + + def Preload(self, path): + """ Splits up a path into forms ready for recombination. """ + path = encoder.MaybeNarrowPath(path) + + # Get down to a base name + path = os.path.normpath(path) + base = os.path.basename(path).lower() + if not base: + output.Error('Couldn\'t parse the file path: %s' % path) + return False + lenbase = len(base) + + # Recognize extension + lensuffix = 0 + compare_suffix = ['.xml', '.xml.gz', '.gz'] + for suffix in compare_suffix: + if base.endswith(suffix): + lensuffix = len(suffix) + break + if not lensuffix: + output.Error('The path "%s" doesn\'t end in a supported file ' + 'extension.' % path) + return False + self.is_gzip = suffix.endswith('.gz') + + # Split the original path + lenpath = len(path) + self._path = path[:lenpath-lenbase] + self._prefix = path[lenpath-lenbase:lenpath-lensuffix] + self._suffix = path[lenpath-lensuffix:] + + return True + #end def Preload + + def GeneratePath(self, instance): + """ Generates the iterations, as described above. """ + prefix = self._path + self._prefix + if type(instance) == types.IntType: + if instance: + return '%s%d%s' % (prefix, instance, self._suffix) + return prefix + self._suffix + return prefix + instance + #end def GeneratePath + + def GenerateURL(self, instance, root_url): + """ Generates iterations, but as a URL instead of a path. """ + prefix = root_url + self._prefix + retval = None + if type(instance) == types.IntType: + if instance: + retval = '%s%d%s' % (prefix, instance, self._suffix) + else: + retval = prefix + self._suffix + else: + retval = prefix + instance + return URL.Canonicalize(retval) + #end def GenerateURL + + def GenerateWildURL(self, root_url): + """ Generates a wildcard that should match all our iterations """ + prefix = URL.Canonicalize(root_url + self._prefix) + temp = URL.Canonicalize(prefix + self._suffix) + suffix = temp[len(prefix):] + return prefix + '*' + suffix + #end def GenerateURL +#end class FilePathGenerator + + +class PerURLStatistics: + """ Keep track of some simple per-URL statistics, like file extension. 
""" + + def __init__(self): + self._extensions = {} # Count of extension instances + #end def __init__ + + def Consume(self, url): + """ Log some stats for the URL. At the moment, that means extension. """ + if url and url.loc: + (scheme, netloc, path, query, frag) = urlparse.urlsplit(url.loc) + if not path: + return + + # Recognize directories + if path.endswith('/'): + if self._extensions.has_key('/'): + self._extensions['/'] = self._extensions['/'] + 1 + else: + self._extensions['/'] = 1 + return + + # Strip to a filename + i = path.rfind('/') + if i >= 0: + assert i < len(path) + path = path[i:] + + # Find extension + i = path.rfind('.') + if i > 0: + assert i < len(path) + ext = path[i:].lower() + if self._extensions.has_key(ext): + self._extensions[ext] = self._extensions[ext] + 1 + else: + self._extensions[ext] = 1 + else: + if self._extensions.has_key('(no extension)'): + self._extensions['(no extension)'] = self._extensions[ + '(no extension)'] + 1 + else: + self._extensions['(no extension)'] = 1 + #end def Consume + + def Log(self): + """ Dump out stats to the output. """ + if len(self._extensions): + output.Log('Count of file extensions on URLs:', 1) + set = self._extensions.keys() + set.sort() + for ext in set: + output.Log(' %7d %s' % (self._extensions[ext], ext), 1) + #end def Log + +class Sitemap(xml.sax.handler.ContentHandler): + """ + This is the big workhorse class that processes your inputs and spits + out sitemap files. It is built as a SAX handler for set up purposes. + That is, it processes an XML stream to bring itself up. + """ + + def __init__(self, suppress_notify): + xml.sax.handler.ContentHandler.__init__(self) + self._filters = [] # Filter objects + self._inputs = [] # Input objects + self._urls = {} # Maps URLs to count of dups + self._set = [] # Current set of URLs + self._filegen = None # Path generator for output files + self._wildurl1 = None # Sitemap URLs to filter out + self._wildurl2 = None # Sitemap URLs to filter out + self._sitemaps = 0 # Number of output files + # We init _dup_max to 2 so the default priority is 0.5 instead of 1.0 + self._dup_max = 2 # Max number of duplicate URLs + self._stat = PerURLStatistics() # Some simple stats + self._in_site = False # SAX: are we in a Site node? + self._in_Site_ever = False # SAX: were we ever in a Site? + + self._default_enc = None # Best encoding to try on URLs + self._base_url = None # Prefix to all valid URLs + self._store_into = None # Output filepath + self._suppress = suppress_notify # Suppress notify of servers + #end def __init__ + + def ValidateBasicConfig(self): + """ Verifies (and cleans up) the basic user-configurable options. 
""" + all_good = True + + if self._default_enc: + encoder.SetUserEncoding(self._default_enc) + + # Canonicalize the base_url + if all_good and not self._base_url: + output.Error('A site needs a "base_url" attribute.') + all_good = False + if all_good and not URL.IsAbsolute(self._base_url): + output.Error('The "base_url" must be absolute, not relative: %s' % + self._base_url) + all_good = False + if all_good: + self._base_url = URL.Canonicalize(self._base_url) + if not self._base_url.endswith('/'): + self._base_url = self._base_url + '/' + output.Log('BaseURL is set to: %s' % self._base_url, 2) + + # Load store_into into a generator + if all_good: + if self._store_into: + self._filegen = FilePathGenerator() + if not self._filegen.Preload(self._store_into): + all_good = False + else: + output.Error('A site needs a "store_into" attribute.') + all_good = False + + # Ask the generator for patterns on what its output will look like + if all_good: + self._wildurl1 = self._filegen.GenerateWildURL(self._base_url) + self._wildurl2 = self._filegen.GenerateURL(SITEINDEX_SUFFIX, + self._base_url) + + # Unify various forms of False + if all_good: + if self._suppress: + if (type(self._suppress) == types.StringType) or (type(self._suppress) + == types.UnicodeType): + if (self._suppress == '0') or (self._suppress.lower() == 'false'): + self._suppress = False + + # Done + if not all_good: + output.Log('See "example_config.xml" for more information.', 0) + return all_good + #end def ValidateBasicConfig + + def Generate(self): + """ Run over all the Inputs and ask them to Produce """ + # Run the inputs + for input in self._inputs: + input.ProduceURLs(self.ConsumeURL) + + # Do last flushes + if len(self._set): + self.FlushSet() + if not self._sitemaps: + output.Warn('No URLs were recorded, writing an empty sitemap.') + self.FlushSet() + + # Write an index as needed + if self._sitemaps > 1: + self.WriteIndex() + + # Notify + self.NotifySearch() + + # Dump stats + self._stat.Log() + #end def Generate + + def ConsumeURL(self, url, allow_fragment): + """ + All per-URL processing comes together here, regardless of Input. + Here we run filters, remove duplicates, spill to disk as needed, etc. + """ + if not url: + return + + # Validate + if not url.Validate(self._base_url, allow_fragment): + return + + # Run filters + accept = None + for filter in self._filters: + accept = filter.Apply(url) + if accept != None: + break + if not (accept or (accept == None)): + url.Log(prefix='FILTERED', level=2) + return + + # Ignore our out output URLs + if fnmatch.fnmatchcase(url.loc, self._wildurl1) or fnmatch.fnmatchcase( + url.loc, self._wildurl2): + url.Log(prefix='IGNORED (output file)', level=2) + return + + # Note the sighting + hash = url.MakeHash() + if self._urls.has_key(hash): + dup = self._urls[hash] + if dup > 0: + dup = dup + 1 + self._urls[hash] = dup + if self._dup_max < dup: + self._dup_max = dup + url.Log(prefix='DUPLICATE') + return + + # Acceptance -- add to set + self._urls[hash] = 1 + self._set.append(url) + self._stat.Consume(url) + url.Log() + + # Flush the set if needed + if len(self._set) >= MAXURLS_PER_SITEMAP: + self.FlushSet() + #end def ConsumeURL + + def FlushSet(self): + """ + Flush the current set of URLs to the output. This is a little + slow because we like to sort them all and normalize the priorities + before dumping. 
+ """ + + # Sort and normalize + output.Log('Sorting and normalizing collected URLs.', 1) + self._set.sort() + for url in self._set: + hash = url.MakeHash() + dup = self._urls[hash] + if dup > 0: + self._urls[hash] = -1 + if not url.priority: + url.priority = '%.4f' % (float(dup) / float(self._dup_max)) + + # Get the filename we're going to write to + filename = self._filegen.GeneratePath(self._sitemaps) + if not filename: + output.Fatal('Unexpected: Couldn\'t generate output filename.') + self._sitemaps = self._sitemaps + 1 + output.Log('Writing Sitemap file "%s" with %d URLs' % + (filename, len(self._set)), 1) + + # Write to it + frame = None + file = None + + try: + if self._filegen.is_gzip: + basename = os.path.basename(filename); + frame = open(filename, 'wb') + file = gzip.GzipFile(fileobj=frame, filename=basename, mode='wt') + else: + file = open(filename, 'wt') + + file.write(SITEMAP_HEADER) + for url in self._set: + url.WriteXML(file) + file.write(SITEMAP_FOOTER) + + file.close() + if frame: + frame.close() + + frame = None + file = None + except IOError: + output.Fatal('Couldn\'t write out to file: %s' % filename) + os.chmod(filename, 0644) + + # Flush + self._set = [] + #end def FlushSet + + def WriteIndex(self): + """ Write the master index of all Sitemap files """ + # Make a filename + filename = self._filegen.GeneratePath(SITEINDEX_SUFFIX) + if not filename: + output.Fatal('Unexpected: Couldn\'t generate output index filename.') + output.Log('Writing index file "%s" with %d Sitemaps' % + (filename, self._sitemaps), 1) + + # Make a lastmod time + lastmod = TimestampISO8601(time.time()) + + # Write to it + try: + fd = open(filename, 'wt') + fd.write(SITEINDEX_HEADER) + + for mapnumber in range(0,self._sitemaps): + # Write the entry + mapurl = self._filegen.GenerateURL(mapnumber, self._base_url) + mapattributes = { 'loc' : mapurl, 'lastmod' : lastmod } + fd.write(SITEINDEX_ENTRY % mapattributes) + + fd.write(SITEINDEX_FOOTER) + + fd.close() + fd = None + except IOError: + output.Fatal('Couldn\'t write out to file: %s' % filename) + os.chmod(filename, 0644) + #end def WriteIndex + + def NotifySearch(self): + """ Send notification of the new Sitemap(s) to the search engines. """ + if self._suppress: + output.Log('Search engine notification is suppressed.', 1) + return + + output.Log('Notifying search engines.', 1) + + # Override the urllib's opener class with one that doesn't ignore 404s + class ExceptionURLopener(urllib.FancyURLopener): + def http_error_default(self, url, fp, errcode, errmsg, headers): + output.Log('HTTP error %d: %s' % (errcode, errmsg), 2) + raise IOError + #end def http_error_default + #end class ExceptionURLOpener + old_opener = urllib._urlopener + urllib._urlopener = ExceptionURLopener() + + # Build the URL we want to send in + if self._sitemaps > 1: + url = self._filegen.GenerateURL(SITEINDEX_SUFFIX, self._base_url) + else: + url = self._filegen.GenerateURL(0, self._base_url) + + # Test if we can hit it ourselves + try: + u = urllib.urlopen(url) + u.close() + except IOError: + output.Error('When attempting to access our generated Sitemap at the ' + 'following URL:\n %s\n we failed to read it. Please ' + 'verify the store_into path you specified in\n' + ' your configuration file is web-accessable. Consult ' + 'the FAQ for more\n information.' 
% url) + output.Warn('Proceeding to notify with an unverifyable URL.') + + # Cycle through notifications + # To understand this, see the comment near the NOTIFICATION_SITES comment + for ping in NOTIFICATION_SITES: + query_map = ping[3] + query_attr = ping[5] + query_map[query_attr] = url + query = urllib.urlencode(query_map) + notify = urlparse.urlunsplit((ping[0], ping[1], ping[2], query, ping[4])) + + # Send the notification + output.Log('Notifying: %s' % ping[1], 1) + output.Log('Notification URL: %s' % notify, 2) + try: + u = urllib.urlopen(notify) + u.read() + u.close() + except IOError: + output.Warn('Cannot contact: %s' % ping[1]) + + if old_opener: + urllib._urlopener = old_opener + #end def NotifySearch + + def startElement(self, tag, attributes): + """ SAX processing, called per node in the config stream. """ + + if tag == 'site': + if self._in_site: + output.Error('Can not nest Site entries in the configuration.') + else: + self._in_site = True + + if not ValidateAttributes('SITE', attributes, + ('verbose', 'default_encoding', 'base_url', 'store_into', + 'suppress_search_engine_notify')): + return + + verbose = attributes.get('verbose', 0) + if verbose: + output.SetVerbose(verbose) + + self._default_enc = attributes.get('default_encoding') + self._base_url = attributes.get('base_url') + self._store_into = attributes.get('store_into') + if not self._suppress: + self._suppress = attributes.get('suppress_search_engine_notify', + False) + self.ValidateBasicConfig() + + elif tag == 'filter': + self._filters.append(Filter(attributes)) + + elif tag == 'url': + self._inputs.append(InputURL(attributes)) + + elif tag == 'urllist': + for attributeset in ExpandPathAttribute(attributes, 'path'): + self._inputs.append(InputURLList(attributeset)) + + elif tag == 'directory': + self._inputs.append(InputDirectory(attributes, self._base_url)) + + elif tag == 'accesslog': + for attributeset in ExpandPathAttribute(attributes, 'path'): + self._inputs.append(InputAccessLog(attributeset)) + + elif tag == 'sitemap': + for attributeset in ExpandPathAttribute(attributes, 'path'): + self._inputs.append(InputSitemap(attributeset)) + + else: + output.Error('Unrecognized tag in the configuration: %s' % tag) + #end def startElement + + def endElement(self, tag): + """ SAX processing, called per node in the config stream. """ + if tag == 'site': + assert self._in_site + self._in_site = False + self._in_site_ever = True + #end def endElement + + def endDocument(self): + """ End of SAX, verify we can proceed. """ + if not self._in_site_ever: + output.Error('The configuration must specify a "site" element.') + else: + if not self._inputs: + output.Warn('There were no inputs to generate a sitemap from.') + #end def endDocument +#end class Sitemap + + +def ValidateAttributes(tag, attributes, goodattributes): + """ Makes sure 'attributes' does not contain any attribute not + listed in 'goodattributes' """ + all_good = True + for attr in attributes.keys(): + if not attr in goodattributes: + output.Error('Unknown %s attribute: %s' % (tag, attr)) + all_good = False + return all_good +#end def ValidateAttributes + +def ExpandPathAttribute(src, attrib): + """ Given a dictionary of attributes, return a list of dictionaries + with all the same attributes except for the one named attrib. + That one, we treat as a file path and expand into all its possible + variations. """ + # Do the path expansion. On any error, just return the source dictionary. 
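+  # For example (hypothetical attribute set): {'path': '/var/log/*.log',
+  # 'encoding': 'UTF-8'} expands to one copy of the dictionary per file
+  # matched by the glob, each with 'path' replaced by a concrete filename.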
+ path = src.get(attrib) + if not path: + return [src] + path = encoder.MaybeNarrowPath(path); + pathlist = glob.glob(path) + if not pathlist: + return [src] + + # If this isn't actually a dictionary, make it one + if type(src) != types.DictionaryType: + tmp = {} + for key in src.keys(): + tmp[key] = src[key] + src = tmp + + # Create N new dictionaries + retval = [] + for path in pathlist: + dst = src.copy() + dst[attrib] = path + retval.append(dst) + + return retval +#end def ExpandPathAttribute + +def OpenFileForRead(path, logtext): + """ Opens a text file, be it GZip or plain """ + + frame = None + file = None + + if not path: + return (frame, file) + + try: + if path.endswith('.gz'): + frame = open(path, 'rb') + file = gzip.GzipFile(fileobj=frame, mode='rt') + else: + file = open(path, 'rt') + + if logtext: + output.Log('Opened %s file: %s' % (logtext, path), 1) + else: + output.Log('Opened file: %s' % path, 1) + except IOError: + output.Error('Can not open file: %s' % path) + + return (frame, file) +#end def OpenFileForRead + +def TimestampISO8601(t): + """Seconds since epoch (1970-01-01) --> ISO 8601 time string.""" + return time.strftime('%Y-%m-%dT%H:%M:%SZ', time.gmtime(t)) +#end def TimestampISO8601 + +def CreateSitemapFromFile(configpath, suppress_notify): + """ Sets up a new Sitemap object from the specified configuration file. """ + + # Remember error count on the way in + num_errors = output.num_errors + + # Rev up SAX to parse the config + sitemap = Sitemap(suppress_notify) + try: + output.Log('Reading configuration file: %s' % configpath, 0) + xml.sax.parse(configpath, sitemap) + except IOError: + output.Error('Cannot read configuration file: %s' % configpath) + except xml.sax._exceptions.SAXParseException, e: + output.Error('XML error in the config file (line %d, column %d): %s' % + (e._linenum, e._colnum, e.getMessage())) + except xml.sax._exceptions.SAXReaderNotAvailable: + output.Error('Some installs of Python 2.2 did not include complete support' + ' for XML.\n Please try upgrading your version of Python' + ' and re-running the script.') + + # If we added any errors, return no sitemap + if num_errors == output.num_errors: + return sitemap + return None +#end def CreateSitemapFromFile + +def ProcessCommandFlags(args): + """ + Parse command line flags per specified usage, pick off key, value pairs + All flags of type "--key=value" will be processed as __flags[key] = value, + "--option" will be processed as __flags[option] = option + """ + + flags = {} + rkeyval = '--(?P\S*)[=](?P\S*)' # --key=val + roption = '--(?P