#!/usr/bin/perl
# Filename:	site_index
# Author:	David Ljung Madison <DaveSource.com>
# See License:	http://MarginalHacks.com/License
  my $VERSION=  1.00;
# Description:	Creates an HTML site index for one or more sites
# Input:	Reads a file (or STDIN) of "domain <tab> path"
use strict;

umask 022;

##################################################
# Setup the variables
##################################################
my $PROGNAME = $0;
$PROGNAME =~ s|.*/||;

# List of [domain,path] pairs, read from file/stdin
my @DOMAINS;

# Regexp of paths to ignore
my $IGNORE;

my $INDEX	= "\.s?html?\$";
my $DEFAULT	= "index.html";
my $SITE	= "Site_Index.html";		# Will overwrite!
my $NO_INDEX	= ".no_index";			# Don't index these directories
my $NO_CONTENTS	= ".no_contents";		# Don't index contents

my $MY_URL	= "http://MarginalHacks.com/#site_index";

#########################
# Usage
#########################
# Read in the domain info into @DOMAINS
sub get_sites {
  my ($file) = @_;
  open(FILE,"<$file") || usage("Can't read input [$file]");
  while (<FILE>) {
    chomp;
    s/^\s+//;
    s/\s+$//;
    my ($dom,$path) = split(/\s+/,$_,2);
    # If this domain is an alias for another domain, just add it to that list
    push(@DOMAINS,[$dom,$path]);
  }
  close FILE;
}

sub usage {
  foreach my $msg (@_) { print STDERR "ERROR:  $msg\n"; }

  my $def_index = $INDEX;
  $def_index =~ s/\./\\./g;

print STDERR <<END_USAGE;


Usage:\t$PROGNAME [-d] <file>
\tBuilds a site index

Reads file (or '-' for STDIN) for domain info:
  domain   path

Options:
  -ignore <regexp>   Ignore these paths/files
  -index <regexp>    Index these types of files [default '$def_index']

Examples:
  -ignore '/images\$'                  Ignore any "images" directories
  -ignore '/(images|thumbnails)\$'     Multiple ignores
  -ignore '/\\.'                       Ignore dot directories
  -index  '\\.(s?html?|txt)\$'          Index .shtm, .shtml, .htm, .html, .txt

END_USAGE
  exit -1;
}

sub parse_args {
  my $file;
  while (my $arg=shift(@ARGV)) {
    if ($arg =~ /^-h$/) { usage(); }
    if ($arg =~ /^-d$/) { $MAIN::DEBUG=1; next; }
    if ($arg =~ /^-ignore(=(.+))?$/) { $IGNORE= $2 ? $3 : shift @ARGV; next; }
    if ($arg =~ /^-index(=(.+))?$/) { $INDEX= $2 ? $3 : shift @ARGV; next; }
    if ($arg =~ /^-./) { usage("Unknown option: $arg"); }
    usage("Too many inputs specified [$arg and $file]") if (defined($file));
    $file=$arg;
  }
  usage("No input defined") unless $file;

  get_sites($file);
}

#########################
# Build a site index
#########################
sub index_last {	# Contents sorter, site_index goes last
  return  1 if $a eq $SITE;
  return -1 if $b eq $SITE;
  return $a cmp $b;
}

sub get_directory {
  my ($domain, $dir, $path) = @_;

  return unless $dir;
  return if $IGNORE && $path =~ /$IGNORE/;

  # Skip it if it's the path of one of our subdomains, let that index it
  foreach my $d ( @DOMAINS ) {
    return if $d->[1] =~ /^$dir./ && $path =~ /^$d->[1]/;
  }

  my @html;

  # Read the directory
  opendir(DIR, $path) || die("[$PROGNAME] Couldn't read directory [$path]\n");
  my @dir = grep(-d "$path/$_" || /$INDEX/ && !/$DEFAULT/,
                 grep(!/^\.{1,2}$/, readdir(DIR)) );
  @dir = grep( "$path/$_" !~ /$IGNORE/, @dir) if $IGNORE;
  closedir(DIR);

  # Handle directories and html
  return unless (@dir);
  push @html, "<ul>\n";
  my $url_path = $path;
  $url_path =~ s|^$dir|http://$domain|g;
  foreach my $file ( sort index_last @dir ) {
    my $name = $file;
    $name =~ s/_/ /g;		# Underbar = space
    $name =~ s|$INDEX||g;	# .html

    unless (-d "$path/$file") {
      push @html, "<a href=$url_path/$file>$name</a><br>\n";
    } elsif (!-f "$path/$file/$NO_INDEX") {
      my @dir_html = get_directory($domain, $dir, "$path/$file")
        unless (-f "$path/$file/$NO_CONTENTS");
      push @html, "<a href=$url_path/$file>$name/</a><br>\n", @dir_html
        if (@dir_html || -f "$path/$file/$DEFAULT");
    }
  }
  push @html, "</ul>\n";
  @html;
}

#########################
# Header/footer
#########################
sub header {
  my ($domain) = @_;
  <<END_OF_HEADER;
<html>
  <head>
    <title>
      Site Index:  $domain
    </title>
  </head>
  <body bgcolor=white>
    Site index for local domains, generated with <a href=$MY_URL>$PROGNAME</a><p>
    <p><hr><p>
END_OF_HEADER
}

sub footer {
  my $date = localtime;
  <<END_OF_FOOTER;
    <p><hr><p>
    Generated on $date;
  </body>
</html>
END_OF_FOOTER
}

##################################################
# Main code
##################################################
sub domains {	# Domain sorter, subdomains go last
  my $ad = $a->[0];
  my $bd = $b->[0];
  my $ad_num = split(/\./,$ad); my $bd_num = split(/\./,$bd);
  return $ad cmp $bd if ($ad_num == $bd_num);
  return $ad_num <=> $bd_num;
}

sub show_site_info {
  my ($site_info,$domain,$path) = @_;
  return unless $site_info->{$domain};

  # Find any domain aliases
  my @domains = grep($_->[1] eq $path, @DOMAINS);
  @domains = map("<a href='http://$_->[0]/'>$_->[0]</a>", @domains);
  my $dom = join(" / ",@domains);

  print SITE "<font size='+1'>$dom:</font><p>\n";
  print SITE @{$site_info->{$domain}};
}

sub main {
  parse_args();

  my %site_info;

  # Get the site index for each site
  my %did;
  foreach my $d ( @DOMAINS ) {
    my ($domain,$path) = @$d;
    next if $did{$path}++;
    @{$site_info{$domain}} = get_directory($domain, $path, $path);
  }

  # Write the site index for each site
  foreach my $d ( @DOMAINS ) {
    my ($domain,$path) = @$d;
    next unless $path;
    next unless $site_info{$domain};
    open(SITE,">$path/$SITE") || die("Can't write site index [$path/$SITE]\n");
    print SITE header($domain);

    show_site_info(\%site_info,$domain,$path);
    foreach my $show_d ( sort domains @DOMAINS ) {
      my ($show_domain,$show_path) = @$show_d;
      next if ($show_domain eq $domain);
      show_site_info(\%site_info,$show_domain,$show_path);
    }
    print SITE footer();
    close(SITE);
    print STDERR "Wrote $path/$SITE\n";
  }
} main();
