#!/usr/bin/perl -w
use strict;

# NOTE: "use" is resolved at compile time no matter where it appears, so
# these modules are always loaded even when only a local file is parsed.
use HTML::TreeBuilder;
use LWP::UserAgent;
use URI::Heuristic qw(uf_uri);

# Extract HTML elements by tag name and attribute values.
#
# Arguments:
#   file       an existing regular file, "-" for standard input, or anything
#              else, which is expanded to a URL by URI::Heuristic and fetched
#   tag        tag name of the elements to print
#   attribute value ...
#              optional pairs handed to HTML::Element's look_down() as
#              additional criteria
#
# Every matching element is printed to standard output as HTML.
#
# TODO: change the syntax so it's like this:
#   tag [attr=val] [...] [tag [attr=val] [...]] [...]
# Additional tags would be matched against the matches from the initial
# tag.  Then you can extract, for example, every <a> found inside a <div>.

# Print the usage message and exit with a failure status.
sub usage {
    die "Usage: $0 file tag attribute value [attribute value]...\n";
}

usage() unless @ARGV >= 2;

my ($file_or_url, $tag, @attr_match) = @ARGV;

# Attribute criteria must come in name/value pairs.
usage() if @attr_match % 2;

my $tree;
if (-f $file_or_url) {
    open(my $fh, "<:utf8", $file_or_url)
        or die "$0: error opening $file_or_url: $!\n";
    # parse_file() finishes the tree itself (it calls eof for us).
    $tree = HTML::TreeBuilder->new->parse_file($fh);
}
elsif ($file_or_url eq '-') {
    $tree = HTML::TreeBuilder->new->parse_file(*STDIN);
}
else {
    my $url = uf_uri $file_or_url;

    my $ua = LWP::UserAgent->new;
    # Override LWP's default agent string; the original author's note says
    # this was needed for Google, presumably because it rejects the default.
    $ua->agent('Mozilla');

    my $response = $ua->get($url);
    die sprintf "GET '%s' failed: %s", $url, $response->status_line
        unless $response->is_success;

    # Prefer the charset-decoded body so the parser sees characters, as it
    # does in the file branch; fall back to the raw body if decoding fails.
    my $content = $response->decoded_content;
    $content = $response->content unless defined $content;

    $tree = HTML::TreeBuilder->new;
    $tree->parse($content);
    # Unlike parse_file(), parse() does not finalize the tree; eof() must be
    # called before the tree is inspected.
    $tree->eof;
}

print $_->as_HTML for $tree->look_down(_tag => $tag, @attr_match);