#!/usr/bin/perl

# URL Shortener Scraper
# Copyright 2009 Duncan Smith
# BSD license

use warnings;
use strict;

use threads;
use Thread::Queue;

require LWP::UserAgent;

# Host to scrape.
my $host = shift @ARGV;

# Modify this if e.g. the host is all-numeric or case-insensitive.
# Interestingly, if you are scraping a site like shorl.com, who uses
# syllables instead of characters, you can have this be a list of
# those syllables.
my @elems = ("0".."9","a".."z","A".."Z");
if ($ARGV != 0) {
    @elems = @ARGV;
}

print "Fetching from $host with characters ".join(",", @elems)."\n";

#abcdefghijklmnopqrstuvwxyz

# Starting point
my @num = qw/13 0 0 0/;

# Stopping point
my @last = qw/22 0 0 0/;

my $maxthreads = 9;
my $maxmsgs = $maxthreads * 10;

# Queue of URLs (actually references to lists of parameters for URLs) to be fetched
my $stream = Thread::Queue->new();

# Queue of fetched redirects to be written (as string form, ready to write)
my $writer = Thread::Queue->new();

sub convert (@) {
    my $out = "";

    for (@_) {
	$out .= $elems[$_];
    }

    return $out;
}

sub fetch ($$) {
    my ($url, $ua) = @_;
    my $out = "";

    $ua->max_redirect( 0 );

    my $resp = $ua->get($url);
    if ($resp->is_redirect) {
	my $loc = $resp->header('Location');
	return $loc;
    } else {
	return '';
    }
}

sub increment () {
    for (reverse @num) {
	$_++;
	if ($_ == scalar @elems) {
	    $_ = 0;
	    next;
	}
	last;
    }
}

sub go ($$$@) {
    my ($ua, $out, $host, @num) = @_;

    my $url = 'http://' . $host . '/' . convert(@num);
    my $dest = fetch $url, $ua;
    $writer->enqueue(convert(@num) . "|" . $dest . "\n");

}

# This is a worker thread to fetch URLs.  There are $maxthreads of these
sub boot () {
    my $ua = LWP::UserAgent->new(agent => "Eat Delicious Poop");

    while (1) {
	my ($host, @num) = @{$stream->dequeue()};
	my $out = "$host/" . convert(@num);
	go($ua, $out, $host, @num);
    }
}

# This is a worker thread to write fetched URLs.  There is only one of these.
sub writer () {
    open my $fh, '>>', $host . ".txt";
    print "Opened $host.txt\n";


    while (1) {
	my $line = $writer->dequeue();
	if (!defined $line) {
	    threads->exit();
	}
	print "R: " .  $stream->pending . "  W: " . $writer->pending . "  URL: " . $host . "/" . substr(substr($line, 0, 70), 0, -1) . "\n";
	print $fh $line;
    }
}

# Create the thread to write to the file
my $writerthread = threads->create(\&writer);

# Create the worker threads
while ($maxthreads > 0) {
    threads->create(\&boot);
    $maxthreads--;
}

# URL-generating loop
while (1) {
    if ($stream->pending < $maxmsgs) {
	increment;

	my @ary :shared = ($host, @num);
	$stream->enqueue(\@ary);
	if (@num ~~ @last) {
	    last;
	}
    } else {
	sleep 1;
    }
}

while ($stream->pending > 0) {
    sleep 1;
}

$writer->enqueue(undef);

$writerthread->join();

exit;

