# archive_types.pl
#
# Provides %TYPES which is a hash of types (eg 'tar.gz') to hashes
# of 'pack', 'unpack' and 'list' programs.
#
# The 'pack' command is such that 'pack dirname' will archive dirname
# and splurge the archive to stdout.
#
# 'unpack' reads the archive from stdin, unpacks it into the current
# directory, and prints filenames to stdout.  'list' just lists
# filenames without actually unpacking.
#
# So to unpack a .tar.gz file:
#
# system "cat file.tar.gz | $TYPES{tar.gz}->{unpack}";
#
# (beware!  don't just say '$TYPES{foo}->{unpack} <$file' - what if
# $TYPES{foo}->{unpack} is itself a pipeline?)
#
# Or to archive the directory 'dir' into a zipfile printed to stdout:
#
# system "$TYPES{zip}->{pack} dir";
#
# To add new archive types, you'll need to make a hash of commands,
# and then add it to the hash that maps 'types' to commands.  Take a
# look at %TAR for a fairly simple example.  Then test it by running
# this file with 'perl archive_types.pl --test-archivers'.
#
# Maybe one day this will become a proper module.  Maybe one day the
# stuff like my_system() might move to a library.  Maybe one day I'll
# write a proper test suite.
#
# This file is in the public domain.  Use at your own risk.
# <http://membled.com/work/apps/unarc/>.
#
# Part of unarc version 0.2.8.
#
# -- Ed Avis, ed@membled.com, 2002-11-15
#

# Check if we should run the test cases.  This will mess up if a
# program including this file has a '--test-archivers' argument, but
# never mind that.
#
my $RUN_TESTS = (@ARGV == 1 and $ARGV[0] eq '--test-archivers');

# Whether to exercise the test suite by including some extra 'types'
# which don't work.
#
my $TEST_BROKEN = 0;

sub maphash(&%);
sub mhash(@);
sub writefile($$);
sub list_diff($$);
sub unpack_ok($$$$$);
sub list_ok($$$);
sub my_system($;$$$);

# Hashes of pack, unpack, list commands for different file formats.

# cpio archives
#
# The shenanigans with sh -c are because the pack command must accept
# the directory name to be packed as its first and only argument.
# Also cpio has the annoying habit of mixing filenames with error
# messages, hence the attempt to filter them out again.
#
my %CPIO = (pack   => q[sh -c 'find $0 -depth -print0 | cpio -o0 --quiet'],
	    unpack => 'cpio -iv --make-directories --quiet -M "" 2>&1'
                      . ' | perl -ne "if (/^cpio:/)'
                      . ' { print STDERR } else { print }"',
	    list   => 'cpio --list --quiet' );

# RPMs can be filtered through rpm2cpio and then treated as cpio
# archives.  We can't create them though.
#
# You need the program pip
# <http://membled.com/work/apps/pip/> to work around the
# fact that newer versions of rpm2cpio cannot read from stdin.
#
my %RPM = ((maphash { "pip -i rpm2cpio - | $_" } %CPIO),
	   pack => undef );

# tar archives
my %TAR = (pack => 'tar -c', unpack => 'tar -xv', list => 'tar -t');

# Gzipped tar archives use tar and gzip
#
# We could use the -z flag to tar, but that bulks out the archive
# with zero bytes at the end so it's a whole number of blocks.  This
# then causes tar -z to complain when it unpacks the archive.
#
my %TAR_GZ = (pack   => "sh -c '$TAR{pack} \$0 | gzip'",
	      unpack => "gzip -d | $TAR{unpack}",
	      list   => "gzip -d | $TAR{list}" );

# Bzip2ed tar archives use tar and bzip2
my %TAR_BZ2 = (pack   => "sh -c '$TAR{pack} \$0 | bzip2'",
	       unpack => "bzip2 -d | $TAR{unpack}",
	       list   => "bzip2 -d | $TAR{list}" );

# Lzop <http://wildsau.idv.uni-linz.ac.at/mfx/lzop.html> + tar
my %TAR_LZO = (pack   => "sh -c '$TAR{pack} \$0 | lzop'",
	       unpack => "lzop -d | $TAR{unpack}",
	       list   => "lzop -d | $TAR{list}" );

# pip needed for zipfiles
my %ZIP = (pack   => 'zip -qr -',

	   # Unzip doesn't have an option to just print filenames!
	   unpack => 'pip -i unzip -L - | ' .
	   q[perl -ne 'if (/^\s+[a-z]+: (.+?)\s*$/) { print "$1\n" }'],

	   list   => 'pip -i zipinfo -1 -');

# pip 0.2 or later is needed for SEA arc.  Unfortunately
# there doesn't seem to be any way to store subdirectories with arc,
# so packing isn't allowed.
#
my %ARC = (pack   => undef,
	   unpack => 'pip -i arc x - | ' .
	   q[perl -ne 's/^Extracting file: // ? print : print STDERR'],
	   list   => 'pip -i arc l - | ' .
	   q[perl -ane 'if (/^\s*=/) { ++$e }
                        else { print "$F[0]\n" if $e == 1 } '] );

# Hash of types and associated commands.
use vars '%TYPES';
%TYPES = mhash([ qw(tar)                               ] => \%TAR,
	       [ qw(tar.gz tar.Z tar.z tgz taz etheme) ] => \%TAR_GZ,
	       [ qw(tar.bz2 bz2)                       ] => \%TAR_BZ2,
	       [ qw(tar.lzo)                           ] => \%TAR_LZO,
	       [ qw(zip ZIP jar)                       ] => \%ZIP,
	       [ qw(cpio)                              ] => \%CPIO,
	       [ qw(rpm)                               ] => \%RPM,
	       [ qw(arc ARC)                           ] => \%ARC  );

if ($RUN_TESTS and $TEST_BROKEN) {
    # Add some extra archivers which will break

    # Packing returns an error
    $TYPES{BROKEN_0} = { pack   => 'perl -e "exit 1"' };

    # Unpacking returns an error
    $TYPES{BROKEN_1} = { pack   => 'tar -c',
			 unpack => 'perl -e "exit 1"' };

    # Unpacking gives the wrong list of filenames
    $TYPES{BROKEN_2} = { pack   => 'tar -c',
			 unpack => 'echo hello world' };

    # Unpacking appears to work, but the contents are wrong
    $TYPES{BROKEN_3} = { pack   => 'tar -c',
			 unpack => 'tar -xv | '
        . q[perl -pe 'system("echo hello >>$_")' 2>/dev/null]
		       };

    # Listing returns an error
    $TYPES{BROKEN_4} = { pack   => 'tar -c',
			 unpack => 'tar -xv',
			 list   => 'perl -e "exit 1"' };

    # Listing gives the wrong list of filenames
    $TYPES{BROKEN_5} = { pack   => 'tar -c',
			 unpack => 'tar -xv',
			 list   => 'echo not again' };
}


# Default archive type
use vars '$DEF_TYPES';
$DEF_TYPE = 'tar.bz2';


# maphash()
#
# Like map but for hashes instead of lists.
#
# Parameters:
#   function to apply to each value
#   hash to be mapped
#
sub maphash(&%) {
    my ($f, %h) = @_;
    my %r;
    foreach my $k (keys %h) {
	local $_ = $h{$k};
	$r{$k} = $f->();
    }
    return %r;
}


# mhash()
#
# Build a hash where several keys point to the same value.
# Parameters are a list of (ref to list of keys) => (value), eg
#
# mhash(['a', 'b'] => 1, ['c'] => 2)
#
# would return the hash (a => 1, b => 1, c => 2).
#
sub mhash(@) {
    my %r;
    while (@_) {
	die 'odd number of arguments' if @_ == 1;
	my $keys = shift;
	my $v = shift;
	foreach (@$keys) {
	    $r{$_} = $v;
	}
    }
    return %r;
}


# Run the test cases below, if wanted.
if ($RUN_TESTS) {
    eval join('', <DATA>);
    die $@ if $@;
}

1;

__END__

# Test suite for archive_types.pl.  This goes through each of the
# types, creating an archive, then unpacking and listing it.
#
# To do: we should not be so chatty.  In the meantime, try this:
#
# % perl archive_types.pl --test-archivers | grep -E 'fail|ok|skip'
#

# Work out a prefix for temporary files.
(my $progname = $0) =~ s!.*/!!;
my $prefix = "$$.$progname";
die "files beginning $prefix already exist" if <$prefix*>;

if (not defined $TYPES{$DEF_TYPE}) {
    print "default type $DEF_TYPE not defined in \%TYPES\n";
}

# Create a directory to archive.
my $testdir = "$prefix.dir";
mkdir $testdir, 0777 or die "cannot mkdir $testdir: $!";

# After archiving, we'll rename it to this so that we can unpack the
# archive we created.
#
my $testdir_orig = "$testdir.orig";

# Create some stuff inside this directory.  We avoid
# Unix-specific things like symlinks and hard links.
#
writefile("$testdir/empty", '');
writefile("$testdir/hello", 'hello');
mkdir "$testdir/emptydir", 0777
  or die "cannot mkdir $testdir/emptydir: $!";
mkdir "$testdir/nonemptydir", 0777
  or die "cannot mkdir $testdir/nonemptydir: $!";
writefile("$testdir/nonemptydir/goodbye", 'goodbye');

# Get the list of files in $testdir
open(FIND, "find $testdir |")
  or die "cannot run find $testdir: $!";
chomp(my @files = sort <FIND>);

# Now test each of the archive types in turn.
foreach my $type (sort keys %TYPES) {
    print "Testing type $type\n";
    my %cmds = %{$TYPES{$type}};
    my ($pack, $unpack, $list) = @cmds{qw[pack unpack list]};
    my $skiptests = 0;

    # At the moment, we have to create archives before we can test
    # unpacking or listing them.  In the future we might have a
    # standard set of pre-built archives to test against.
    #
    if (not defined $pack) {
	print "pack command undefined, unable to test\n";
	print "$type skipped\n";
	next;
    }

    # Archive $testdir into a file.
    my $archive = "$prefix.$type";
    print "packing with command '$pack'\n";
    my $cmd = "$pack $testdir >$archive";
    my ($rc, $out, $err);
    my_system($cmd, \$rc, \$out, \$err)
      or print("$type pack fail\n"), $skiptests = 1;

    if ($out ne '') {
	print "$type pack produced strange messages: $out\n";
	# but don't skip the other tests, it's not serious
    }

    # Rename $testdir to get it out of the way, so we can unpack.
    rename $testdir => $testdir_orig
      or die "cannot rename $testdir to $testdir_orig: $!";

    my $anyfail = 0;

    # Test the 'unpack' command
    if (defined $unpack) {
	if ($skiptests) {
	    print "$type unpack skipped\n";
	}
	elsif (unpack_ok($unpack, $archive,
			 $testdir, $testdir_orig,
			 \@files ))
	{
	    print "$type unpack ok\n";
	}
	else {
	    print "$type unpack fail\n";
	    $anyfail = 1;
	}
    }

    # Test the 'list' command
    if (defined $list) {
	if ($skiptests) {
	    print "$type list skipped\n";
	}
	elsif (list_ok($list, $archive, \@files)) {
	    print "$type list ok\n";
	}
	else {
	    print "$type list fail\n";
	    $anyfail = 1;
	}
    }

    if ($anyfail) {
	print "preserving $archive\n";
    }
    else {
	unlink $archive or die "cannot unlink $archive: $!";
    }

    my_system("rm -rf $testdir") or die "cannot rm -rf testdir: $!";
    rename $testdir_orig => $testdir
      or die "cannot rename $testdir_orig to $testdir: $!";
}

# Finished testing.
my_system("rm -rf $testdir") or die "cannot rm -rf testdir: $!";


# writefile()
#
# Create a file with given contents, die if error.
#
# Parameters:
#   filename
#   contents
#
sub writefile($$) {
    die 'usage: writefile(filename, contents)' if @_ != 2;
    my ($filename, $contents) = @_;
    local *F;
    open(F, ">$filename") or die "can't open $filename for writing: $!";
    print F $contents     or die "can't write to $filename: $!";
    close F               or die "can't close $filename: $!";
}


# list_diff()
#
# Do two lists differ?
#
# Parameters:
#   reference to first list
#   reference to second list
#
# Returns true iff they differ.  Only works for lists of strings.
#
sub list_diff($$) {
    die 'usage: list_diff(ref to first list, ref to second list)'
      if @_ != 2;
    my ($a, $b) = @_;
    my @a = @$a; my @b = @$b;
    return 1 if (scalar @a) != (scalar @b);
    for (my $i = 0; $i < @a; $i++) {
	return 1 if $a[$i] ne $b[$i];
    }
    return 0;
}


# unpack_ok()
#
# Run an unpack command, and check that it produces the right results.
#
# Parameters:
#   command to run
#   archive to feed it on stdin
#   directory that command should create
#   original directory to compare contents with
#   ref to sorted list of filenames that should be printed
#
# Returns:
#   whether the tests passed
#
# Prints messages to stdout if tests fail; the command may itself
# complain to stderr.
#
sub unpack_ok($$$$$) {
    die 'usage: unpack_ok(command, archive, dir, orig dir, files listref)'
      if @_ != 5;
    my ($cmd, $arc, $dir, $orig_dir, $files) = @_;

    print "testing unpack command: $cmd\n";
    open(UNPACK, "{ $cmd ; } <$arc |")
      or print "cannot execute: $!", return 0;

    # The command should print the names of files unpacked
    chomp(my @unpacked_files = sort <UNPACK>);
    s!/$!! foreach @unpacked_files;

    # Check the exit status of the command
    if (not close UNPACK) {
	# Something went wrong
	if ($! == 0) {
	    # The only problem is non-zero exit status
	    my $status = $? / 256;
	    print "$cmd returned non-zero exit status: $status\n";
	    return 0;
	}
	else {
	    # Something more serious
	    die "could not pipe archive to $cmd: $!";
	}
    }

    # Check that the names given are the same as originally archived
    if (list_diff($files, \@unpacked_files)) {
	print "$cmd reports different list of files\n";
	print "original files: ", join(', ', @$files), "\n";
	print "unpacked files: ", join(', ', @unpacked_files), "\n";
	return 0;
    }

    # Check that the archive really did unpack
    if (not -d $dir) {
	print "$cmd did not create $dir";
	return 0;
    }

    # Okay, it ran, printed the right names, and created $dir.  But
    # are the contents correct?  Use diff to find out.
    #
    my ($rc, $out, $err);
    my_system("diff -qr $testdir_orig $testdir", \$rc, \$out, \$err);

    # Now check what diff thought.
    if (not defined $rc or not defined $out) {
	# Something went wrong.
	if (defined $err) {
	    die "failed to execute diff: $err";
	}
	else {
	    # Shouldn't get here.
	    die "diff failed, and my_system() seems broken too";
	}
    }
    elsif ($rc == 0) {
	# No differences found.  All done.
	warn "diff produced spurious output: $out" if $out ne '';
	return 1;
    }
    elsif ($rc == 1) {
	# Diff found something - it will be in $out.
	print "differences found: $out\n";
	return 0;
    }
    elsif ($rc == 2) {
	print "diff returned error code\n";
	print "and printed: $out\n" if $out ne '';
	return 0;
    }
    else {
	die "unexpected return code from diff: $rc (printed '$out')";
    }

    # Remove the directory we unpacked
    my_system("rm -rf $testdir") or die "cannot rm -rf $testdir: $!";
}


# list_ok()
#
# Run a 'list' command, and check it produces the right results.
#
# Parameters:
#   command to run
#   archive to feed it on stdin
#   ref to sorted list of filenames expected
#
# Returns:
#   whether the tests passed
#
# Will print stuff to stdout if there is a failure.
#
sub list_ok($$$) {
    die 'usage: list_ok(command, archive, ref to list of filenames)'
      if @_ != 3;
    my ($cmd, $arc, $files) = @_;

    print "testing list command: $cmd\n";
    open(LIST, "{ $cmd ; } <$arc |")
      or print "cannot execute: $!", return 0;

    # The command should print the names of files unpacked
    chomp(my @got_files = sort <LIST>);
    s!/$!! foreach @got_files;

    # Check the exit status of the command
    if (not close LIST) {
	# Something went wrong
	if ($! == 0) {
	    # The only problem is non-zero exit status
	    my $status = $? / 256;
	    print "$cmd returned non-zero exit status: $status\n";
	    return 0;
	}
	else {
	    # Something more serious
	    die "could not pipe archive to $cmd: $!";
	}
    }

    # Check that the names given are the same as originally archived
    if (list_diff($files, \@got_files)) {
	print "$cmd reports different list of files\n";
	print "original files: ", join(', ', @$files), "\n";
	print "listed files: ", join(', ', @got_files), "\n";
	return 0;
    }

    # All okay.
    return 1;
}


# my_system()
#
# Run an external command, checking for errors.
#
# Parameters:
#   command to run (single string)
#   reference to scalar where exit status will be placed
#   reference to scalar where command output will be placed
#   reference to scalar where error message will be placed
#
# Any of the references can be undef if you're not interested in
# that value.
#
# Returns:
#   success value
#
# The behaviour is as follows:
#
# - If the command dumped core, or could not be executed, set error
# message and return false.
#
# - If the command was killed, die.  (This is for the convenience of
# the user hitting Ctrl-C and expecting things to die quickly.)
#
# - Otherwise, store the exit status and output text in the scalars
# passed, and return true only if the exit status was 'success' (0).
# If the exit status was nonzero, set error message and return false.
#
# Before running the command the environment variables LANG and LC_ALL
# will be set to 'C' so that the error messages are consistent no
# matter what the user's language.
#
# For example:
#
# my ($rc, $text, $err);
# my_system('cat file', \$rc, \$text, \$err);
#
# my_system('echo hello') or warn "oh dear: $err";
#
# The idea behind having an 'error message' rather than just die()ing
# is that die() is reserved for the user interrupting the program or
# for things that 'cannot happen'.
#
sub my_system($;$$$) {
    die 'usage: my_system(command, [scalar ref, [scalar ref, [scalar ref]]])'
      unless @_ >= 1 and @_ <= 4;
    my ($cmd, $statusp, $outp, $errp) = @_;
    local $ENV{LANG} = 'C';
    local $ENV{LC_ALL} = 'C';

    # Set the returned values to undef to start with.
    foreach ($statusp, $outp, $errp) { $$_ = undef if $_ }

    # Stick '|' onto the end of the command, so we can capture its
    # stdout.  Note that if you pass in a command looking like
    # 'foo >file0', then the output will indeed go into file0, and
    # my_system() won't see any of it.
    #
    local *CMD;
    open (CMD, "$cmd |")
      or $$errp = "cannot execute '$cmd |': $!", return 0;

    if ($outp) {
	# Slurp all of the command's output
	local $/ = undef;
	$$outp = <CMD>;
    }
    else {
	# Just run the command to completion
	0 while <CMD>;
    }

    # Check the exit status of the command
    if (not close CMD) {
	# Something went wrong (maybe just exit status)
	my ($status, $sig, $core) = ($? >> 8, $? & 127, $? & 128);

	if ($core) {
	    $$errp = "'$cmd |' dumped core\n";
	    return 0;
	}
	elsif ($sig) {
	    die "'$cmd |' killed by signal $sig";
	}
	elsif ($status) {
	    # Command returned an error code
	    $$statusp = $status if $statusp;
	    $$errp = "'$cmd |' returned error code $status";
	    return 0;
	}
	else {
	    # Shouldn't get here.
	    die "strange error from '$cmd |': $! (close() returned $?)";
	}
    }
    else {
	# Everything went okay.
	$$statusp = 0;

	# Just double-check that things are set correctly.
	if ($statusp) { die "status not set" if not defined $$statusp }
	if ($outp)    { die "out not set"    if not defined $$outp    }
	if ($errp)    { die "error set"      if     defined $$errp    }

	# Return true, for success.
	return 1;
    }
}

