
Re: [SLUG] How to Use


On 24 May, Matthew Palmer wrote:
>  > Louis> I just tried the "wget -r http://www.domain.com/dir/" suggestion, and the
>  > following happened:
>  >
>  > 1. Created a dir called "www.domain.com/dir"
>  > 2. In "www.domain.com/dir", I only see the index.html file.
>   
>  Does the index.html file have any links to other pages on the same site?  If
>  not, then wget has done its job - it has taken a recursive snapshot of the
>  website.
>   
>  If index.html has links to other files residing on the same site, then we do
>  have problems, and if you could furnish us with more info on the structure
>  of the site (including its actual location, if feasible) we might be able
>  to give more hints.

Louis, it sounds like what you did *should* have worked, for most web
sites.
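
For what it's worth, a fuller invocation (all standard wget options;
the URL is just a placeholder) would be something like:

    wget -r -np -l inf -k -p http://www.domain.com/dir/

Here -np stops wget climbing into the parent directory, -l inf lifts
the default 5-level recursion limit, -k converts the links in fetched
pages so they work locally, and -p also grabs page requisites such as
images and stylesheets.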

Here's a script I use to snapshot a web site that I and several other
people maintain.  I keep the local copy under CVS control.  This way,
no information can be lost.

It won't help you much, though, if the site doesn't let you do a wget -r.
:-(
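
One quick check, if you suspect that's what's happening: wget honours
robots.txt on recursive fetches, so look at what the site disallows
(again, substitute the real host):

    wget -q -O - http://www.domain.com/robots.txt

If the directory you want appears under a Disallow line, that's the
problem.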

luke
#!/bin/sh
#
# Take a snapshot of a web site, update local files under cvs control,
# make suggestions about adding files if new ones appear,
# and output the results in a simple summarised form.
#
# Designed to be run from cron if necessary, even by root.
#
# Author: Luke Kendall
#
# Copyright (C) Luke Kendall 2002, but hereby placed into the public domain.
#
HOME_DIR=$HOME		# overridden below if -owner is given
LOCAL_COPY=
MYNAME=`basename "$0"`
OWNER=${LOGNAME:-$USER}
QUOTA=-Q50M
SOURCE_URL=
USAGE="usage: $MYNAME [-owner logname] [-dir local-directory] [-quota <number>M]
    -url www.something.com/some-directory/
Where:
    -dir   local-directory is the directory where the site should be copied to.
           This should be specified *after* -owner, if a different owner is
	   required.
    -owner logname is the login name of the person considered to be the owner
           of the files being downloaded.  Useful if you want the downloads
	   to be done by root.  This option is only useful for root.
	   Defaults to login name of the user.
    -quota You have to specify a size that wget understands.  The default is 50M,
           meaning 50 megabytes.
    -url   You have to provide a URL.  Leave off the http://.
Example:
    urlsnapshot -owner luke -dir smrg -url www.geocities.com/sydsmrg/"

usage()
{
    echo "$USAGE" >&2
    exit 1
}

while [ $# != 0 ]
do
    [ $# -lt 2 ] && usage
    case $1 in
	-dir)
	    for LOCAL_COPY in $HOME_DIR/$2 ./$2
	    do
		if [ -d "$LOCAL_COPY" ]
		then
		    break
		fi
	    done
	    ;;
	-owner)
	    OWNER=$2
	    HOME_DIR=`grep "^$OWNER:" /etc/passwd | cut -d : -f 6`
	    if [ -z "$HOME_DIR" ]
	    then
		echo "$MYNAME: $OWNER not found in /etc/passwd" >&2
		exit 1
	    fi
	    ;;
	-quota)
	    if [ "$2" = "0" -o "$2" = "0M" ]
	    then
		QUOTA=
	    else
		QUOTA=-Q$2
	    fi
	    ;;
	-url)
	    SOURCE_URL=$2
	    ;;
	*)
	    echo "$MYNAME: unknown options $*" >&2
	    usage
	;;
    esac
    shift 2
done
[ $# != 0 ] && usage
[ -z "$SOURCE_URL" ] && usage
[ -z "$LOCAL_COPY" ] && LOCAL_COPY=.	# default to the current directory


cd "$LOCAL_COPY" || exit 1
if [ ! -w . ]
then
    echo "$MYNAME: no write permission in `pwd`" >&2
    exit 1
fi

#
# Capture normal output and overwrite nightly log.
# Error output is allowed through unchanged.
#
{
    #
    # Append wget's log to sms.log; non-verbose; apply the download
    # quota (default 50M); don't recurse upward into the parent
    # directory; recursive fetch.  We have already cd-ed into
    # $LOCAL_COPY, so the log path is relative to it.
    #
    wget -a sms.log -nv $QUOTA -np -r "$SOURCE_URL"
    chown -R $OWNER .
    cd "$SOURCE_URL"
    cvs diff
    WHEN=`date "+%a %e %b %Y"`
    #
    # cvs requires you to be su-ed to root, or at least LOGNAME
    # set to be your real account.
    #
    export LOGNAME=$OWNER
    cvs update > /tmp/sms$$ 2>&1
    cvs commit -m "Nightly snapshot for $WHEN"
} > sms.log

#
# Report less-significant changes:
#
if grep -v "^?" /tmp/sms$$ > /dev/null
then
    echo "Changed files:"
    grep -v "^?" /tmp/sms$$
fi

if grep "^[?MP]" /tmp/sms$$ > /dev/null
then
    if grep "^?" /tmp/sms$$ > /dev/null
    then
	echo "New files now in local cache appear prefixed with \`?':"
	grep "^?" /tmp/sms$$
    fi
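    #
    # NB: "<<-" strips leading tabs (not spaces), which is what lets
    # the terminating EOF below be indented; the space-indented body
    # lines keep their indentation in the output.
    #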
    cat <<-EOF
    You should consider doing this:
    	cd $LOCAL_COPY/$SOURCE_URL
	EOF
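    #
    # Walk the "cvs update" output: "?" lines are files cvs doesn't
    # know about yet (we suggest "cvs add", with -kb so CVS stores
    # binaries verbatim); "M" lines are locally modified files that
    # the commit above has already picked up.
    #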
    while read status fname
    do
	if [ "x$status" = "x?" ]
	then
	    if file "$fname" | grep data > /dev/null
	    then
		bin="-kb"
	    elif file "$fname" | grep text > /dev/null
	    then
		bin=""
		#
		# Don't do this (below): on every run, wget would see
		# that the source file was bigger than our trimmed copy,
		# and download it again.  We'd have to replicate the
		# directory elsewhere, and use that altered copy for
		# cvs control and uploads.
		#####
		# Tidy up any freshly-fetched file.
		#ex - "$fname" <<-'EOF'
		#	/<!-- text below generated by server. PLEASE REMOVE/,$d
		#	x
		#EOF
	    else
		echo "# This one seems weird: `file \"$fname\"`"
	    fi
	    echo "	cvs add $bin '$fname'"
	elif [ "x$status" = "xM" ]
	then
	    echo "# $fname has been automatically committed."
	    echo "# It is assumed that you uploaded the changed file."
	fi
    done < /tmp/sms$$
    echo "	cvs diff"
    echo "	cvs commit"
else
    echo "No new files on web site"
fi
rm -f /tmp/$MYNAME.log
mv /tmp/sms$$ /tmp/$MYNAME.log
chown $OWNER /tmp/$MYNAME.log
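
Since it's designed to be run from cron, a crontab entry along these
lines would do (the time and install path are only illustrative):

    # Snapshot the SMRG site at 3:15 every morning
    15 3 * * * /usr/local/bin/urlsnapshot -owner luke -dir smrg -url www.geocities.com/sydsmrg/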