#!/usr/bin/perl use warnings; use strict; use LWP::Simple; use HTML::TokeParser; use Log::TraceMessages qw(t d); Log::TraceMessages::check_argv; die "usage: $0 URL...\n" if not @ARGV; LIST: while (@ARGV) { my $url = shift @ARGV; local $SIG{__DIE__} = sub { die "$url: $_[0]"; }; $_ = get($url); die "cannot get $url" if not defined; if (/a listing of all the public mailing lists/) { my @links; my $p = HTML::TokeParser->new(\$_) or die; while (my $t = $p->get_tag('a')) { my $href = $t->[1]{href}; next if not defined $href; next if $href =~ /\Amailto:/; next if $href =~ m{/mailman/admin\z}; push @links, $href; } unshift @ARGV, @links; next LIST; } s/\s+/ /g; if (/No such list/) { warn "'No such list' in $url, skipping\n"; next; } m!
(.+?)
!i) { for ($desc) { $_ = ucfirst $1; tr/\r/ /s; s/<[^>]+>//g; s/ / /g; my $Discussion_of = qr{(?:the |to |[Ww]e (?:mainly |)|[Ff]or |)(?:[Gg]eneral |)[Dd]iscuss(?:ion(?:s|)|es||ing)(?: of| about| on| relating to| and|)}; my $List = qr{(?:(?:[Mm]ailing[ -]|)[Ll]ist|[Ff]orum)}; my $This_list = qr{(?:This |The |A |)(?:is the |)$List(?: that we use to|)}; my $For = qr{(?:intended |)(?:to be|)(?:(?:used |)for|where|dedicated to|about|to provide a forum for discussing|to discuss|oriented toward|purpose is|)}; my $old_len = length; for (;;) { s/^This is a //; s/^(?:$This_list |)(?:[Ii]s (?:$For |)|purpose is |has |)//; s/^$Discussion_of //; $_ = ucfirst; last if length == $old_len; $old_len = length; } s/^\s+//; s/\s+$//; undef $_ if not length; undef $_ if /^To see the collection of prior postings/; } } } my $read_only = 0; for ($desc) { $_ = $heading_desc if not defined; # Don't worry if we found two conflicting descs. if (defined) { if (/copy of/ and /archive/) { warn "list $list_name is a copy of some other list, not writing\n"; next LIST; } if (/deprecated/i) { warn "list $list_name is deprecated, not writing\n"; next LIST; } if (/(?(\S+)\s+[aA]rchives!) { warn "did not see 'visit the X archives' link, skipping\n"; next; } my ($archive_url, $archive_list_name) = ($1, $2); if (lc $archive_list_name ne lc $list_name) { warn "archive list name $archive_list_name doesn't match $list_name, skipping\n"; next; } for (get $archive_url) { die "couldn't get $archive_url" if not defined; s/\s+/ /g; if (/No messages have been posted to this list yet, so the archives are currently empty/) { warn "list $list_name has empty archive, skipping\n"; next LIST; } } my $project_url; if (defined $desc and ($desc =~ s/\s*\((http\S*?)\)// or $desc =~ s/\s*(http\S*)(?:$|\s)/ /)) { $project_url = $1; } my $list_addr; if (m!send e?mail to\s*(.*?)!i) { die "differing addresses in mailto: $1, $2" if $1 ne $2; $list_addr = $1; } /Subscribing to $list_name/i or die "did not see 'Subscribing to $list_name'"; #/$list_name subscribers/i or die "did not see '$list_name subscribers'"; if (not /unsubscribe from $list_name/i) { warn "$url: did not see 'unsubscribe from $list_name', skipping\n"; next; } if (/([^<>]+)(?:<[^>]*>)? list run by/) { die "saw '$1 list run by', not '$list_name run by'" if lc($1) ne lc($list_name) and lc($1) ne 'a mailing'; } if (/([^<>]+)(?:<[^>]*>)? administrative interface/i) { die "saw '$1 administrative interface', not '$list_name administrative interface'" if lc($1) ne lc($list_name); } print "list info page: $url\n"; print "list name: $list_name\n"; print "description: $desc\n" if defined $desc; print "project URL: $project_url\n" if defined $project_url; print "list address: $list_addr\n" if defined $list_addr; print "read only\n" if $read_only; print "\n"; }