#!/usr/bin/env python
"""web2png -- convert a web hierarchy from using GIFs to using PNGs.

This script is a front end for gif2png that assists you in converting an
entire website.  Requires gif2png 1.0.5 or later.  Requires python 2.0 or
later.  Requires Unix.

by Eric S. Raymond
Slightly modified by Aaron Isotton for Debian.
"""
__version__ = "$Revision: 1.25 $"

# Future library code -- I'm going to submit this for Python 1.6

import os
import sys
from stat import *

if sys.version[:3] < "2.0":
    print("This program requires Python 2.0 or later.")
    sys.exit(1)

FTW_FILE = 0        # Item is a normal file
FTW_DIR = 1         # Item is a directory
FTW_CHRDEV = 2      # Item is a character special device file
FTW_BLKDEV = 3      # Item is a block special device file
FTW_FIFO = 4        # Item is a FIFO
FTW_SYMLINK = 5     # Item is a symbolic link
FTW_SOCKET = 6      # Item is a socket
FTW_NOSTAT = -1     # stat failed on item
FTW_DNR = -2        # item was an unreadable directory


class FTWException(Exception):
    """Raised inside the walk to abort it; carries the callback's value."""
    pass


def ftw_inner(func, path=".", extra=()):
    """Visit path and, if it is a directory, everything beneath it.

    func is called as func(path, statinfo, ftype, *extra), where ftype is
    one of the FTW_* type codes.  A negative return value from func aborts
    the whole walk by raising FTWException with that value; any other true
    value keeps the walk from descending into path; a false value lets the
    walk continue normally.

    Returns FTW_NOSTAT if path cannot be stat'ed, or FTW_DNR if it is a
    directory that cannot be listed; func is not called in either case.
    Otherwise returns None.
    """
    try:
        st = os.stat(path)
    except OSError:
        return FTW_NOSTAT
    mode = st[ST_MODE]
    if S_ISDIR(mode):
        # List the directory before calling func so that an unreadable
        # directory is reported without func ever seeing it.
        try:
            subdirs = os.listdir(path)
        except OSError:
            return FTW_DNR
    if S_ISREG(mode):
        ftype = FTW_FILE
    elif S_ISDIR(mode):
        ftype = FTW_DIR
    elif S_ISBLK(mode):
        ftype = FTW_BLKDEV
    elif S_ISCHR(mode):
        ftype = FTW_CHRDEV
    elif S_ISFIFO(mode):
        ftype = FTW_FIFO
    elif S_ISLNK(mode):
        ftype = FTW_SYMLINK
    elif S_ISSOCK(mode):
        ftype = FTW_SOCKET
    terminate = func(*((path, st, ftype) + tuple(extra)))
    if terminate < 0:
        raise FTWException(terminate)
    if S_ISDIR(mode) and not terminate:
        for f in subdirs:
            ftw_inner(func, os.path.join(path, f), extra)


def ftw(func, path=".", *extra):
    """Python tree-walker -- like ftw(3), but more capable.

    Walks the tree rooted at path, calling func(path, statinfo, ftype,
    *extra) for each item (see ftw_inner).  Returns 0 when the walk runs
    to completion.  If func aborts the walk with a negative value, the
    FTWException instance carrying that value is returned instead.
    """
    try:
        ftw_inner(func, path, extra)
    except FTWException:
        return sys.exc_info()[1]
    return 0


def find(dir=".", regexp=None):
    """Return a tuple list of entries beneath dir matching a given pattern.

    Each entry is a (path, ftype) pair.  regexp is a compiled pattern that
    is searched for in the path; when it is None every entry is returned.
    """
    def findhook(path, st, ftype, r, fl):
        if not r or r.search(path):
            fl.append((path, ftype))
        return 0
    flist = []
    ftw(findhook, dir, regexp, flist)
    return flist


def findfiles(dir=".", regexp=None):
    """Return a list of files beneath dir matching a given pattern.

    Only regular files are returned, each as a normalized path.
    """
    return [os.path.normpath(path)
            for (path, ftype) in find(dir, regexp)
            if ftype == FTW_FILE]


def cmp(f1, f2, blocksize=1):
    """Compare the files named f1 and f2 block by block.

    Returns None when the contents are identical, otherwise the 1-based
    number of the first block of blocksize bytes that differs.  The files
    are read in binary mode so the comparison is byte-exact.
    """
    fp1 = open(f1, "rb")
    try:
        fp2 = open(f2, "rb")
        try:
            blocknum = 0
            while 1:
                blocknum = blocknum + 1
                s1 = fp1.read(blocksize)
                s2 = fp2.read(blocksize)
                if not s1 and not s2:
                    return None
                if s1 != s2:
                    return blocknum
        finally:
            fp2.close()
    finally:
        # Closed here as well so fp1 does not leak if opening f2 fails.
        fp1.close()

# End future library code

import getopt
import re
import shutil
import string
import urllib

try:
    import commands
except ImportError:
    # Interpreters without the commands module provide getoutput() in
    # subprocess.
    import subprocess as commands

try:
    from urllib import urlretrieve
except ImportError:
    from urllib.request import urlretrieve

nochange = all = None

# Defaults for the settings web2png() reads as globals.  The command-line
# driver overrides them; having them here keeps web2png() callable when
# this file is imported rather than run.
verbose = 0
transparency = ""

gifre = re.compile(r"\.gif$", re.IGNORECASE)
pngre = re.compile(r"\.png$", re.IGNORECASE)
pagere = re.compile(r"\.s?html$|\.s?htm$|\.php$|\.inc$|\.css$|\.js$", re.IGNORECASE)
bakre = re.compile(r"\.bak$", re.IGNORECASE)

# Possible left delimiters of these matches are =, ", (, \s
# Possible right delimiters of these matches are ", >, ), \s
# .?"? parts deal with the possibility that the optional " might be backslashed
delim = r'(?:\\?")?'    # Optional double quote, possibly preceded by backslash
gif = delim + r'([^">]*\.gif)' + delim
imgre = re.compile(r'SRC=' + gif, re.IGNORECASE)
# NOTE(review): hrefre and basere were previously compiled from empty
# patterns, which match at every position and capture nothing.  The
# patterns below are reconstructions (hrefre mirrors imgre/backre; basere
# captures the HREF of a BASE tag) -- confirm against the upstream script.
hrefre = re.compile(r'HREF=' + gif, re.IGNORECASE)
backre = re.compile(r'BACKGROUND=' + gif, re.IGNORECASE)
basere = re.compile(r'<BASE\b[^>]*\bHREF=' + delim + r'([^">\s]*)', re.IGNORECASE)
cssurlre = re.compile(r'url\(' + gif + r'\)', re.IGNORECASE)


def version_controlled(page):
    """Return true if the given page is under RCS version control.

    A page counts as controlled when its ",v" file exists either beside
    the page or in an RCS subdirectory of the page's own directory.
    """
    (pagedir, pagename) = os.path.split(page)
    return os.path.exists(page + ",v") \
           or os.path.exists(os.path.join(pagedir, "RCS", pagename) + ",v")


def _read_page(page):
    # Return the full contents of the file named page.
    fp = open(page, "r")
    try:
        return fp.read()
    finally:
        fp.close()


def _write_page(page, contents):
    # Replace the contents of the file named page.
    fp = open(page, "w")
    try:
        fp.write(contents)
    finally:
        fp.close()


def _gif2png_probe(gifs, redirect):
    # Run "gif2png -w" over gifs and return whichever output stream the
    # shell redirection in redirect leaves attached.
    # NOTE(review): file names go into the shell command unquoted, so names
    # containing whitespace or shell metacharacters will misbehave.
    return commands.getoutput("gif2png -w " + transparency
                              + " ".join(gifs) + redirect)


def _page_references(page, contents, giflist, gif_basenames):
    # Return the (reference-without-".gif", local gif path) pairs found in
    # one page that point at GIFs on giflist.
    match = basere.search(contents)
    if match:
        basedir = match.group(1)
    else:
        basedir = os.path.dirname(page)
    matches = imgre.findall(contents) + hrefre.findall(contents) \
              + backre.findall(contents) + cssurlre.findall(contents)
    if verbose:
        print("Found %d .gif image(s) references in %s." % (len(matches), page))
    convert_refs = []
    for ref in matches:
        withbase = os.path.join(basedir, ref)
        # Simple case -- it's a filename, we presume it's local
        # NOTE(review): withbase[1:] is everything after the first
        # character, so the second test is true for nearly every value;
        # confirm whether a leading-separator check was intended.
        if withbase[:5] != "http:" and withbase[1:] != os.sep:
            target = os.path.normpath(withbase)
            if target not in giflist:
                if verbose:
                    print("Won't replace %s. Not on my list." % target)
                continue
        # Tricky case -- compare by basename and content
        else:
            basename = os.path.basename(withbase[7:])
            if basename not in gif_basenames:
                print("No local match by basename for %s" % (ref,))
                continue
            target = giflist[gif_basenames.index(basename)]
            (data, headers) = urlretrieve(withbase)
            try:
                not_equal = cmp(data, target)
            finally:
                os.remove(data)
            if not_equal:
                print("Data of %s and %s don't match at offset %d"
                      % (ref, target, not_equal))
                continue
        convert_refs.append((ref[:-4], target))
    return convert_refs


def _replace_reference(contents, ref):
    # Return contents with each delimited occurrence of ref + ".gif"
    # changed to ref + ".png".
    # NOTE(review): the group names are reconstructed; the previous pattern
    # text "(?P[...])" was not a valid regular expression.
    source = r'(?P<ldelim>[="(\s])' + re.escape(ref) \
             + r'\.gif(?P<rdelim>[\s"\\>)])'
    # Backslashes are doubled so they stay literal in the template.
    replacement = r"\g<ldelim>" + ref.replace("\\", "\\\\") + r".png\g<rdelim>"
    print("Source: %s, target %s" % (source, replacement))
    return re.sub(source, replacement, contents)


def web2png(directory):
    """Convert a web hierarchy rooted on the given directory.

    Finds the GIFs and pages beneath directory, asks gif2png which GIFs
    are eligible, reports what would change and, unless the global
    nochange is set, converts the GIFs and rewrites the references to
    them in the pages.  A page is checked out with "co -l" if it is under
    version control and copied to a .bak file otherwise.  Reads the
    globals all, nochange, verbose and transparency.  Exits the program
    with status 1 if there are more than 5120 GIFs.
    """
    gifs = findfiles(directory, gifre)
    htmls = findfiles(directory, pagere)

    # There's a standard max on the number of arguments we can feed gif2png.
    if len(gifs) > 5120:
        sys.stderr.write("web2png: Too many GIFs. "
                         "Try converting some subtrees first.\n")
        sys.exit(1)
    print("This web subtree has %d GIFs and %d pages." % (len(gifs), len(htmls)))

    # Display information on files we won't convert
    rejects = ""
    if gifs:        # with no GIFs there is nothing to ask gif2png about
        rejects = _gif2png_probe(gifs, " >/dev/null")
    rejects = re.sub("gif2png: ", " ", rejects)
    if rejects:
        if all:
            print("You are forcing conversion of these GIFs:")
        else:
            print("The following GIFs will not be converted:")
        print(rejects)

    # Figure out which files are eligible for conversion
    if all:
        giflist = gifs
    elif gifs:
        giflist = _gif2png_probe(gifs, " 2>/dev/null").split()
    else:
        giflist = []
    if verbose:
        print("Giflist: " + repr(giflist))
    convert_gifs = []
    for gif_name in giflist:
        # gifre ignores case, matching the way the GIFs were selected.
        if os.path.exists(gifre.sub(".png", gif_name)):
            print("\t%s already has a PNG equivalent" % (gif_name,))
        else:
            convert_gifs.append(gif_name)

    # Display information on files we will convert
    if convert_gifs:
        print("The following GIFs will be converted:\n\t"
              + "\n\t".join(convert_gifs))
    else:
        print("All eligible GIFs seem to have been converted already.")

    # Create a dictionary mapping pages to sets of references to be mapped
    print("Checking for HTML, PHP JavaScript and include pages that need conversion...")
    pagecount = 0
    page_conversions = {}
    gif_basenames = [os.path.basename(name) for name in giflist]
    for page in htmls:
        convert_refs = _page_references(page, _read_page(page),
                                        giflist, gif_basenames)
        if convert_refs:
            print("\tIn %s, I see: %s"
                  % (page, " ".join([ref + ".gif" for (ref, target) in convert_refs])))
            page_conversions[page] = convert_refs
            pagecount = pagecount + 1
    print("%d HTML or PHP page(s) need conversion." % (pagecount,))

    # Unless user is willing to make changes, we're done now
    if nochange:
        return

    # Convert gifs verbosely
    if convert_gifs:
        print("GIF conversions begin:")
        os.system("gif2png -v -O " + " ".join(convert_gifs))

    # Now check to see which conversions did not take
    failures = [gif_name for gif_name in convert_gifs
                if not os.path.exists(gifre.sub(".png", gif_name))]
    if failures:
        print("Some conversions failed: " + " ".join(failures))

    # Now hack the references in the web pages
    for page in page_conversions.keys():
        print("Converting %s..." % (page,))
        if version_controlled(page):
            os.system("co -l " + page)
        else:
            os.system("cp -pf " + page + " " + page + ".bak")
        contents = _read_page(page)
        for (ref, target) in page_conversions[page]:
            if target in giflist and target not in failures:
                contents = _replace_reference(contents, ref)
        _write_page(page, contents)
    print("Page conversions complete.")
def cleanup(directory):
    """Remove the leftovers of a finished conversion beneath directory.

    Every .bak file is deleted, and so is every .gif that sits beside a
    .png with the same base name.
    """
    for backup in findfiles(directory, bakre):
        os.unlink(backup)
    for png_path in findfiles(directory, pngre):
        gif_path = png_path[:-4] + ".gif"
        if os.path.exists(gif_path):
            os.remove(gif_path)


def unconvert(directory):
    """Reverse a conversion beneath directory.

    Each .png that still has its .gif beside it is deleted.  Each page
    with a .bak copy gets that copy renamed back over it; a page without
    one that is under version control has "rcs -u" run on it.
    """
    for png_path in findfiles(directory, pngre):
        if os.path.exists(png_path[:-4] + ".gif"):
            os.unlink(png_path)
    for page in findfiles(directory, pagere):
        backup = page + ".bak"
        if os.path.exists(backup):
            os.rename(backup, page)
        elif version_controlled(page):
            os.system("rcs -u " + page)


if __name__ == '__main__':
    delete = nochange = reverse = verbose = 0
    transparency = ""
    (options, arguments) = getopt.getopt(sys.argv[1:], "adnrtv")
    if not arguments:
        arguments = ['.']       # default to the current directory
    flags = [flag for (flag, value) in options]
    if '-a' in flags:
        all = 1
    if '-d' in flags:
        delete = 1
    if '-n' in flags:
        nochange = 1
    if '-r' in flags:
        reverse = 1
    if '-t' in flags:
        transparency = " -t "
    if '-v' in flags:
        verbose = 1

    # -d takes precedence over -r; with neither, do a conversion.
    if delete:
        action = cleanup
    elif reverse:
        action = unconvert
    else:
        action = web2png
    for directory in arguments:
        action(directory)

# The following sets edit modes for GNU EMACS
# Local Variables:
# mode:python
# End: