#!/usr/bin/perl -w use strict; use HTML::TreeBuilder; use URI::Find; #use constant OUTPUT_PIPE_CMD => undef; # for no pipe use constant OUTPUT_PIPE_CMD => '|tidy -iq -wrap 0'; # convert URLs inside html text elements to links # # e.g., BEFORE:
http://jerkface.net
# AFTER : sub find_uri_ranges { my @uri; my $s = URI::Find->new(sub # lol, isn't this interface great { my ($uri, $orig_uri) = @_; push @uri, $orig_uri; return qq{\0$orig_uri}; } ); my $text = shift; $s->find(\$text) || return (); my @index; while ((my $idx = index $text, "\0") >= 0) { push @index, $idx; substr($text, $idx, 1) = ""; } return map { ($index[$_], length $uri[$_]) } 0 .. $#index; } sub usage { die "Usage: $0 [file]"; } usage if @ARGV > 1; my $file_or_url = shift || '-'; my $tree; if ($file_or_url eq '-') { $tree = HTML::TreeBuilder->new->parse_file(*STDIN); } elsif (-f $file_or_url) { open(my $fh, "<:utf8", $file_or_url) or die "$0: error opening $file_or_url: $!\n"; $tree = HTML::TreeBuilder->new->parse_file($fh); close $fh; } else { use URI::Heuristic qw(uf_uri); use LWP::UserAgent; my $url = uf_uri $file_or_url; my $ua = LWP::UserAgent->new; $ua->agent('Mozilla'); # google = assholes my $response = $ua->get($url); die sprintf "GET '%s' failed: %s", $url, $response->status_line unless $response->is_success; $tree = HTML::TreeBuilder->new->parse($response->content); } $tree->objectify_text(); my @text_els = $tree->look_down(_tag => '~text'); for (@text_els) { my $text = $_->attr('text'); my @ranges = find_uri_ranges $text; my @segments; my $cursor = 0; while (@ranges >= 2) { my ($idx, $len) = splice(@ranges, 0, 2); push @segments, substr($text, $cursor, $idx - $cursor) if $idx; my $url = substr($text, $idx, $len); push @segments, [a => {href => $url}, $url]; $cursor = $idx + $len; } push @segments, substr($text, $cursor) if $cursor < length $text; if (@segments > 1) { $_->replace_with(@segments); } } open STDOUT, OUTPUT_PIPE_CMD or warn $!; print $tree->as_HTML;