#!/usr/bin/perl use warnings; use strict; use LWP::Simple; use HTML::TokeParser; use Log::TraceMessages qw(t d); Log::TraceMessages::check_argv; die "usage: $0 URL...\n" if not @ARGV; LIST: while (@ARGV) { my $url = shift @ARGV; local $SIG{__DIE__} = sub { die "$url: $_[0]"; }; $_ = get($url); die "cannot get $url" if not defined; if (/a listing of all the public mailing lists/) { my @links; my $p = HTML::TokeParser->new(\$_) or die; while (my $t = $p->get_tag('a')) { my $href = $t->[1]{href}; next if not defined $href; next if $href =~ /\Amailto:/; next if $href =~ m{/mailman/admin\z}; push @links, $href; } unshift @ARGV, @links; next LIST; } s/\s+/ /g; if (/No such list/) { warn "'No such list' in $url, skipping\n"; next; } m!([^>]+)!i or die " not seen"; my $title = $1; my $list_name; for ($title) { if (/^([A-Za-z0-9.-]+) Info Page$/) { $list_name = lc $1; } } s/About Red Hat//; if (not /About (.*)/s) { warn "did not see 'About' in $url, skipping\n"; next; } my $after_about = $1; my ($heading_list_name, $heading_desc); if (/>([A-Za-z0-9.-]+)\s+-+\s+([^<]+)</m) { ($heading_list_name, $heading_desc) = (lc $1, $2); } if (defined $heading_list_name) { for ($list_name) { if (not defined) { $_ = $heading_list_name; } elsif ($_ ne $heading_list_name) { die "saw list name $_ in title, $heading_list_name in heading\n"; } } } if (defined $list_name) { if ($after_about !~ /^$list_name/i) { warn "did not see 'About $list_name' in $url, skipping\n"; next; } } else { $after_about =~ /^([^<>]+)/ or die "didn't see list name after 'About'"; $list_name = $1; for ($list_name) { s/^\s+//; s/\s+$//; die 'could not get list name' if not tr/A-Za-z//; } } die if not length $list_name; my $desc; for ($after_about) { if (m!<p>(.+?)</p>!i) { for ($desc) { $_ = ucfirst $1; tr/\r/ /s; s/<[^>]+>//g; s/ / /g; my $Discussion_of = qr{(?:the |to |[Ww]e (?:mainly |)|[Ff]or |)(?:[Gg]eneral |)[Dd]iscuss(?:ion(?:s|)|es||ing)(?: of| about| on| relating to| and|)}; my $List = qr{(?:(?:[Mm]ailing[ -]|)[Ll]ist|[Ff]orum)}; my $This_list = qr{(?:This |The |A |)(?:is the |)$List(?: that we use to|)}; my $For = qr{(?:intended |)(?:to be|)(?:(?:used |)for|where|dedicated to|about|to provide a forum for discussing|to discuss|oriented toward|purpose is|)}; my $old_len = length; for (;;) { s/^This is a //; s/^(?:$This_list |)(?:[Ii]s (?:$For |)|purpose is |has |)//; s/^$Discussion_of //; $_ = ucfirst; last if length == $old_len; $old_len = length; } s/^\s+//; s/\s+$//; undef $_ if not length; undef $_ if /^To see the collection of prior postings/; } } } my $read_only = 0; for ($desc) { $_ = $heading_desc if not defined; # Don't worry if we found two conflicting descs. if (defined) { if (/copy of/ and /archive/) { warn "list $list_name is a copy of some other list, not writing\n"; next LIST; } if (/deprecated/i) { warn "list $list_name is deprecated, not writing\n"; next LIST; } if (/(?<!the )private/ or /do not (?:try to )subscribe/i) { warn "list $list_name is private, not writing\n"; next LIST; } if (/no posting is allowed/i) { $read_only = 1; } } } if (/archive is only available to the list members/ or /archive is currently accessible to members only/) { warn "list $list_name has no archive, not writing\n"; next LIST; } if (not m!To see the collection of prior postings to the list,\s+visit the <a href="([^"]+)">(\S+)\s+[aA]rchives</a>!) { warn "did not see 'visit the X archives' link, skipping\n"; next; } my ($archive_url, $archive_list_name) = ($1, $2); if (lc $archive_list_name ne lc $list_name) { warn "archive list name $archive_list_name doesn't match $list_name, skipping\n"; next; } for (get $archive_url) { die "couldn't get $archive_url" if not defined; s/\s+/ /g; if (/No messages have been posted to this list yet, so the archives are currently empty/) { warn "list $list_name has empty archive, skipping\n"; next LIST; } } my $project_url; if (defined $desc and ($desc =~ s/\s*\((http\S*?)\)// or $desc =~ s/\s*(http\S*)(?:$|\s)/ /)) { $project_url = $1; } my $list_addr; if (m!send e?mail to\s*<a href="mailto:(.*?)">(.*?)</a>!i) { die "differing addresses in mailto: $1, $2" if $1 ne $2; $list_addr = $1; } /Subscribing to $list_name/i or die "did not see 'Subscribing to $list_name'"; #/$list_name subscribers/i or die "did not see '$list_name subscribers'"; if (not /unsubscribe from $list_name/i) { warn "$url: did not see 'unsubscribe from $list_name', skipping\n"; next; } if (/([^<>]+)(?:<[^>]*>)? list run by/) { die "saw '$1 list run by', not '$list_name run by'" if lc($1) ne lc($list_name) and lc($1) ne 'a mailing'; } if (/([^<>]+)(?:<[^>]*>)? administrative interface/i) { die "saw '$1 administrative interface', not '$list_name administrative interface'" if lc($1) ne lc($list_name); } print "list info page: $url\n"; print "list name: $list_name\n"; print "description: $desc\n" if defined $desc; print "project URL: $project_url\n" if defined $project_url; print "list address: $list_addr\n" if defined $list_addr; print "read only\n" if $read_only; print "\n"; }