#!/usr/bin/perl -wl
use strict;
use HTML::TreeBuilder;
#use LWP::Simple;
use LWP::UserAgent;
use File::Slurp;
use File::Basename;
use Encode;
use Cwd;
use Fcntl qw(F_SETFL O_NONBLOCK :DEFAULT);
use List::Util qw(min);
use List::MoreUtils qw(uniq);
use DB_File;
use constant SLEEP_SECONDS => 30;
use constant VERBOSE => 1;
use constant USE_WGET => 0;
use constant CONF_DIR => "$ENV{HOME}/.4chan-save";
-d CONF_DIR or mkdir CONF_DIR or die "mkdir: $!";
my %URLS;
sub remove_url { delete $URLS{$_} for @_; }
sub add_url { $URLS{$_} = 0 for @_; }
sub schedule_url($$) { my ($url, $time) = @_; exists $URLS{$url} and $URLS{$url} = $time; }
$SIG{INT} = sub { exit };
END { warn "exiting\n"; untie %URLS; }
tie(%URLS, 'DB_File', CONF_DIR . '/argv.db', O_RDWR|O_CREAT, 0600, $DB_HASH) or die $!;
add_url(@ARGV);
#my @OLD_URLS;
#tie(@OLD_URLS, 'DB_File', CONF_DIR . '/argv.old.db', O_RDWR|O_CREAT, 0600, $DB_RECNO) or die $!;
#add_url(@OLD_URLS);
fcntl(STDIN, F_SETFL, O_NONBLOCK) or die $!;
use IO::Select;
my $select = IO::Select->new();
$select->add(\*STDIN);
sub maybe_read_line($&)
{
my ($timeleft, $callback) = @_;
my $done = time + $timeleft;
do {
if ($select->can_read($timeleft)) {
my $line;
if (sysread STDIN, $line, 1024*8) {
if ($line =~ m/\n/) {
chomp $line;
$callback->($line);
} else {
die qq{maybe_read_line: unexpected behavior (raw tty mode?): \$line="$line"};
}
}
}
} while (($timeleft = $done - time) > 0);
}
sub sleep_read
{
my $seconds = shift;
maybe_read_line($seconds, sub {
add_url(grep {m{^https?://.}} split ' ', shift);
});
}
sub wait_url($)
{
my $url = shift or return;
my $when = $URLS{$url} or return;
my $now = time;
return unless $when > $now;
my $seconds = min(SLEEP_SECONDS, $when - $now);
warn "waiting $seconds seconds to fetch $url\n" if VERBOSE;
sleep_read($seconds);
}
my $ua = LWP::UserAgent->new;
$ua->agent('Mozilla/5.0'); # WTF, 4chan??
sub getstore($$)
{
my ($url, $outfile) = @_;
my $response = $ua->mirror(@_);
unless ($response->is_success or $response->code == 404) {
warn sprintf "error fetching '%s': %s\n", $url, $response->status_line;
} elsif (VERBOSE) {
printf "get(%s) => %d\n", $url, $response->code;
}
return $response->code;
}
my $olddir = getcwd or die $!;
my $dest;
while (keys %URLS)
{
for my $url (sort {$URLS{$a} <=> $URLS{$b}} keys %URLS)
{
$url =~ s/#.*//;
$dest = ($url =~ m{/([^/]+)/res/(\d+)/*$}) ? "$1-$2" : int rand(2147483647); # 4chan thread urls end in numbers
-d $dest or mkdir $dest or warn $!;
chdir $dest or warn($!), next;
wait_url($url);
if (USE_WGET) {
print "wget $url" if VERBOSE;
if (system(qw(wget -c -q -N -nd --page-requisites), "$url") != 0) {
remove_url($url) if $? >> 8 == 8;
warn(($? >> 8 == 8) ? "404\n" :
($? == -1) ? "failed to exec wget: $!" :
($? & 127) ? sprintf("wget died with signal %d", $? & 127) :
sprintf("wget returned %d ($?)", $? >> 8));
next;
}
} else {
my $fn = basename($url);
my $tmpname = "$fn~";
my $res = getstore($url, $tmpname);
unless ($res == 200) {
remove_url($url), warn "404\n" if $res == 404;
unlink $tmpname;
next;
}
rename $tmpname, $fn;
}
my $html = Encode::decode_utf8(read_file basename $url) or next;
# print "parse $url" if VERBOSE;
my $parser = HTML::TreeBuilder->new;
$parser->ignore_unknown(0);
my $tree = $parser->parse($html);
my %links;
for ($tree->look_down(_tag => 'a')) {
my $link = $_->attr('href') or next;
next unless $link =~ m{/src/([^/]+)$}; # 4chan image links match this
my $fn = $1;
next if -e $fn;
$links{$link}++;
next;
}
my @links = sort {;no warnings 'numeric'; $a<=>$b} keys %links;
if (USE_WGET) {
system(qw(wget -c -N -nv), @links) if keys %links;
} else {
for my $link (@links) {
next unless $link =~ m{/src/([^/]+)$}; # 4chan image links match this
my $fn = $1;
next if -e $fn;
my $res = getstore($link, $fn);
warn "404\n", last if $res == 404;
}
}
schedule_url($url, time + SLEEP_SECONDS);
} continue {
chdir $olddir or die $!;
rmdir $dest;
}
sleep_read(SLEEP_SECONDS);
}