#!/usr/bin/perl -w
use strict;
use HTML::TreeBuilder;
# extract HTML elements by name and attribute
# TODO: change the syntax so it's like this: [attr=val] [...] [tag [attr=val] [...]] [...]
# Additional tags would be matched against the matches from the initial
# tag. Then you can extract all <a>'s from within <div>'s, for example.
# Abort with a usage summary: die() writes the message to STDERR and the
# script exits with a non-zero status.
# The source argument may be a local file, "-" for standard input, or
# anything else, which is treated as a URL. Attribute/value pairs are
# optional, but when given they must come in pairs.
sub usage
{
    die "Usage: $0 file|url|- tag [attribute value]...\n";
}
# Main program.
# Arguments: a source (local file, "-" for STDIN, or anything else, which is
# treated as a URL), a tag name, and zero or more attribute/value pairs that
# a matching element must carry.
usage() unless @ARGV >= 2;
my ($file_or_url, $tag, @attr_match) = @ARGV;
usage() if @attr_match % 2;    # attributes and values must come in pairs

# Keep our own reference to the builder instead of relying on the return
# value of parse_file()/parse().
my $tree = HTML::TreeBuilder->new;

if (-f $file_or_url) {
    open(my $fh, "<:utf8", $file_or_url)
        or die "$0: error opening $file_or_url: $!\n";
    $tree->parse_file($fh);
    close $fh;
}
elsif ($file_or_url eq '-') {
    # Decode STDIN the same way the file branch decodes its input.
    binmode(STDIN, ':utf8');
    $tree->parse_file(*STDIN);
}
else {
    # Loaded at run time so that parsing a local file or STDIN does not
    # require the networking modules ("use" would load them unconditionally).
    require URI::Heuristic;
    require LWP::UserAgent;

    # uf_uri() expands shorthand such as "example.com" into a full URI.
    my $url = URI::Heuristic::uf_uri($file_or_url);
    my $ua = LWP::UserAgent->new;
    # Override LWP's default agent string; presumably some sites (the
    # original note singled out Google) refuse the default one.
    $ua->agent('Mozilla');
    my $response = $ua->get($url);
    die sprintf "GET '%s' failed: %s", $url, $response->status_line
        unless $response->is_success;

    # Prefer the character-decoded body; fall back to the raw bytes if the
    # charset could not be decoded.
    my $html = $response->decoded_content;
    $html = $response->content unless defined $html;

    # Unlike parse_file(), parse() does not finish the tree by itself:
    # eof() must be called before the tree is inspected.
    $tree->parse($html);
    $tree->eof;
}

# Emit every element whose tag and attribute values all match.
print $_->as_HTML for $tree->look_down(_tag => $tag, @attr_match);