#!/usr/bin/perl

=head1 NAME

splitfs - split directory trees into files with total size limits

=head1 SYNOPSIS

B<splitfs> I<bytecount> I<dirname> [I<dirname> ... ]

=head1 DESCRIPTION

This Perl script recursively scans all the files and directories in each of
the I<dirname> directories. It collects the names of these files and
directories and writes them into report files, one name per line, such that
the total byte count of the entries listed in each report file does not
exceed the I<bytecount> given as the first command line argument. The names
of the report files (which are created, or truncated if they already exist)
are derived from the name of the perl script by appending a three-digit
number; the first report file is splitfs000, the second splitfs001 and so on.

The total number of report files created depends on I<bytecount>
(the larger I<bytecount>, the fewer report files are created and the more
names each of them lists), on the total number
of directories and files below the I<dirname> directories, and on the
(average) number of bytes in the individual files.

If a directory (i.e., the file that holds its file names and inodes)
or a file contains more than I<bytecount> bytes, its name is not put
into any of the report files. A diagnostic message is written to stderr
for each such entry.

The program does not optimize the aggregation into the report files
so as to bring the total byte count in each of them as close as
possible to I<bytecount>. It just runs recursively through
the directories in the order given on the command line, and switches
to a new report file as soon as adding the next entry would push
the total number of bytes represented by the files already listed
in the current report file beyond I<bytecount>.

=head1 EXAMPLE

B<splitfs> 800000 ./bin /tmp/mydir

creates files splitfs000, splitfs001 and so on, where each of the splitfs???
files contains file names (one per line) starting with ./bin and/or /tmp/mydir,
and the total byte count of the files listed in each of splitfs000, splitfs001
and so on is at most 800000 bytes.

=head1 CAUTIONS

The total number of report files will not be larger than 1000 (the last
one baptized splitfs999).

Files within the I<dirname> directories that are symbolic links to
other files will be counted with the full size of the file they point
to, and be listed in the report files (unless this full size is larger
than I<bytecount>).

The program does not follow symbolic links to directories
found within the I<dirname> directories. 

The disk space actually reserved for files when it is counted in blocks
(as done by du(1M), dd(1) or tar(1)) is larger than the plain byte counts
used here; this overhead is not taken into account.

=head1 AUTHOR

Richard J. Mathar, I<http://www.strw.leidenuniv.nl/~mathar>

Feb. 8, 2001

=cut

use IO::File ;

use File::Basename ;

# Package globals (deliberately not lexicals): recurdir() reads $follow and
# advances $outfisuff whenever it rolls over to the next report file.
$gotbytes = 0 ;			# bytes accounted for in the current report file
$follow = 0 ;			# don't follow symbolic links to directories

# Report files are named <script>000, <script>001, ... Only the base name of
# $0 is used, so that the reports are created in the current working
# directory even when the script is started with a path (./splitfs or
# /usr/local/bin/splitfs), and so that the name consists of the script name
# followed by the three-digit counter only.
$outfisuff = basename($0)."000" ;

if ( @ARGV < 2 )		# simple syntax check
{
	# a usage error is a failure: report on stderr with a non-zero status
	print STDERR "usage: $0 <bytecount> <dirname> [<dirname> ... ]\n" ;
	exit 1 ;
}

# maximum number of bytes in each assembly; what remains in ARGV are the directories
$wantbytes = shift( @ARGV ) ;

# another syntax check: $wantbytes must be a non-empty string of digits 0-9
# (an empty argument contains no non-digit either, so it is tested explicitly)
if ( $wantbytes !~ m/\A[0-9]+\z/ )
{
	print STDERR "$0: $wantbytes not a positive number\n" ;
	exit 1 ;
}

# Open the first report file; without it no name could be recorded at all,
# so failing to create it is fatal.
$outfh = IO::File->new ;
open($outfh, '>', $outfisuff) or die "$0: cannot write $outfisuff: $!\n" ;
foreach my $dir ( @ARGV )	# one directory entry at a time
{
	# recurdir hands back the running byte total and the handle of whichever
	# report file is open after it has processed $dir
	($gotbytes,$outfh) = recurdir($gotbytes,$wantbytes,$dir,$outfh) ;
}
# buffered write errors only show up when the file is closed
close($outfh) or die "$0: error closing $outfisuff: $!\n" ;
exit ;

# Run recursively through one directory tree and append the names of its
# entries to the report files.
#
# Arguments:
#   $gotb   bytes already accounted for in the currently open report file
#   $wantb  upper limit for the byte total of a single report file
#   $dir    directory to scan
#   $outfh  handle of the currently open report file
# Returns the list ($gotb, $outfh): the updated byte total and the handle of
# the report file that is open on return.
#
# Reads the package global $follow and advances the package global
# $outfisuff (name of the current report file) each time a report file is full.
#
# Don't follow symbolic links that are directories (unless $follow is 1), but
# list symbolic links to files and count their 'size' (ie, the size of the
# file they point to) against the bytes. Entries larger than $wantb are
# reported on stderr and not listed anywhere.
sub recurdir
{
	my ($gotb, $wantb, $dir, $outfh) = @_ ;

	# A lexical handle, read completely and closed before recursing: a shared
	# bareword handle would be reopened by every nested call.
	my $dh ;
	unless ( opendir($dh,$dir) )
	{
		# unreadable directory or not a directory at all: tell the user
		# instead of silently listing nothing, then carry on with the rest
		print STDERR "$0: cannot read directory $dir: $!\n" ;
		return ($gotb,$outfh) ;
	}
	my @fils = nodots(readdir($dh)) ;	# all entries except the parent directory
	closedir($dh) ;

	foreach my $entr ( @fils )
	{
		my $fullfils = $dir . "/" . $entr ;
		if ( -d $fullfils and $entr ne "." and ( $follow == 1 or ! -l $fullfils ) )
		{
			# a directory, and either the 'follow' flag is set or the
			# directory isn't a symbolic link: recursive call
			($gotb,$outfh) = recurdir($gotb,$wantb,$fullfils,$outfh) ;
			next ;
		}

		# either a plain file or the $dir/. directory itself
		my $filsby = ( -s $fullfils ) || 0 ;	# how large is it? (empty or missing counts as 0)
		if ( $filsby > $wantb )			# doesn't fit in any list of size $wantb
		{
			print STDERR "skipping $fullfils with $filsby bytes\n" ;
			next ;
		}

		if ( $gotb + $filsby > $wantb )		# doesn't fit in current list
		{
			# close old file, open new file with "next" name
			close($outfh) or print STDERR "$0: error closing $outfisuff: $!\n" ;
			# Advance the numeric tail arithmetically and keep its zero
			# padding. The ++ operator on a string only counts up names made
			# of letters followed by digits; on a name with a path or a dot
			# it yields plain numbers, and on ...999 it carries into the
			# letters. Here ...999 is followed by ...1000.
			$outfisuff =~ s/([0-9]+)\z/sprintf("%0*d", length($1), $1 + 1)/e ;
			# every following name would be lost without the new file
			open($outfh, '>', $outfisuff) or die "$0: cannot write $outfisuff: $!\n" ;
			$gotb = 0 ;
		}
		print $outfh "$fullfils\n" ;
		$gotb += $filsby ;
	}
	return ($gotb,$outfh) ;
}

# Remove the '..' (parent directory) entries from a directory listing.
#
# Takes the list of entry names and returns the same names, in the same
# order, without any '..'. The '.' entry is kept on purpose: recurdir lists
# it to account for the size of the directory itself.
#
# Declared without a prototype because the sub takes a list of arguments,
# and written without a loop counter so that no package global is touched.
sub nodots
{
	return grep { $_ ne ".." } @_ ;
}