#!/usr/bin/env python
"""Split a blog export into one ``.mdwn`` file per post.

The input file is scanned line by line.  A line starting with a
``[[meta author="..."]]`` directive begins a new post; the post's
``[[meta date="M/D/Y h:m:s AM|PM"]]`` line determines the output filename.
Output files are created in the current working directory.

Usage: script.py EXPORT_FILE
"""

import os
import random
import re
import sys

# NOTE: the standard ``re`` module is used instead of the old ``sre`` alias,
# which was deprecated in Python 2.5 and does not exist in Python 3.

# A line that starts a new post.
poststart = re.compile(r'^\[\[meta author="(?:[^\\"]|\\.)*"\]\]')

# A post's date line; groups are month, day, year, hour, minute, second
# and the AM/PM marker (in that order, as consumed by dumppost).
dateline = re.compile(
    r'^\[\[meta date="([0-9]+)/([0-9]+)/([0-9]+)'
    r' +([0-9]+):([0-9]+):([0-9]+) +([AP]M)"\]\]')

# A self-closed <br/> tag (optional spaces, either letter case).
breakre = re.compile(r'< *[bB][rR] */>')

# An empty <div ...></div> pair.  The attribute part must not run past the
# opening tag's own '>' (hence [^">]); otherwise a non-empty div followed by
# an empty one would be swallowed together with its content.
emptydivre = re.compile(r'< *div(?:[^">]|"(?:[^\\"]|\\.)*")*> *< */div *>')

# A comment header line.
commentre = re.compile(r'^Comment by .* at')


def _to_24_hour(hour, meridiem):
    """Convert a 12-hour clock reading to a 24-hour hour value.

    12 AM maps to 0 and 12 PM stays 12; every other PM hour gains 12.
    """
    hour = hour % 12
    if meridiem.upper() == 'PM':
        hour += 12
    return hour


def dumppost(lines):
    """Write one post to a file named after its date line.

    lines -- list of text lines making up the post (each already
             newline-terminated).

    The filename stem is ``YYYY-MM-DD--HH-MM-SS`` built from the date line;
    if several lines match, the last one wins.  If no line matches, the stem
    is the string ``None`` (i.e. the post goes to ``None.mdwn``).  When the
    target already exists, a random hexadecimal suffix is appended until an
    unused name is found.

    Returns the name of the file that was written.
    """
    # Find the date
    fn_stem = None
    for line in lines:
        m = dateline.match(line)
        if m:
            hour = _to_24_hour(int(m.group(4)), m.group(7))
            fn_stem = '%02d-%02d-%02d--%02d-%02d-%02d' % (
                int(m.group(3)), int(m.group(1)), int(m.group(2)),
                hour, int(m.group(5)), int(m.group(6)))

    fn = '%s.mdwn' % fn_stem
    # NB: not race-condition-safe!
    while os.path.exists(fn):
        # Integer bound: randint() does not accept a float such as 1E30.
        fn = '%s-%x.mdwn' % (fn_stem, random.randint(0, 10 ** 30))

    # The context manager guarantees the file is flushed and closed.
    with open(fn, 'w') as out:
        out.writelines(lines)
    return fn


def find_and_dump_entries(lines):
    """Split an iterable of export lines into posts and dump each one.

    lines -- any iterable of text lines (an open file works).

    A line matching ``poststart`` flushes the post collected so far and
    starts a new one.  Lines seen before the first such line are collected
    and dumped as a post of their own.
    """
    currpost = []

    for line in lines:
        if poststart.match(line) and currpost:
            dumppost(currpost)
            currpost = []

        # Drop the bogus empty divs blogspot inserts and
        # interpret <br/> as line breaks.
        cleaned = emptydivre.sub('', line)
        if commentre.match(cleaned):
            # Turn the comment header into a level-3 heading.
            cleaned = '### ' + cleaned
        for real_line in breakre.split(cleaned):
            currpost.append(real_line.rstrip() + '\n')

    # Flush the final post.
    if currpost:
        dumppost(currpost)


def main(argv=None):
    """Run the converter on the file named by the first argument.

    argv -- argument vector; defaults to ``sys.argv``.

    Returns a process exit status: 0 on success, 2 when the input file
    argument is missing.
    """
    if argv is None:
        argv = sys.argv
    if len(argv) < 2:
        sys.stderr.write('usage: %s EXPORT_FILE\n' % argv[0])
        return 2
    with open(argv[1]) as in_file:
        find_and_dump_entries(in_file)
    return 0


if __name__ == '__main__':
    sys.exit(main())