#
# Python script to convert simplephpblog to movabletype export format.
# I wrote this for importing my simplephpblog into b2evolution, which has a
# movabletype importer, so it has not actually been tested with movabletype.
#
# Directions:
# 1) Download your 'content' directory tree from your hosting provider. If you
#    don't know how to do this, try googling "wget" as a starting point. If you
#    do use wget, be sure to enable recursion (-r) and expand the maximum
#    recursion depth (-l 20)
# 2) Place this script in the same directory as the content directory is in
# 3) Modify defaults, below
# 4) Cross fingers
# 5) Run the script. The output goes to standard output, so you will need to
#    redirect it to a file, e.g. "python sphp2mt.py > myblogcontent.txt"
#
# By mypalmike
# fixed by flowolf, to handle the multiple categories issue in sphpblog.
# first category is taken. rest omitted.
#
# print() is always called with exactly one string argument, which behaves
# the same as a Python 2 print statement and as the Python 3 function.
#

import os
import re
from datetime import datetime, timedelta

# Set your defaults here
defaultauthor = 'mypalmike'
alwaysoverrideauthor = 1
categorymap = {
    1: 'News',
    2: 'Hardware Gadgets',
    8: 'Software',
    3: 'Life & Thoughts',
    555: 'bla',
}
imagesurl = 'http://myblog.com/images/'
delta = timedelta(hours=+1)  # Offset from UTC - Too lazy to do real timezones
spammers = set(['loan payday', 'Casino game', 'Sprint ringtone',
                'Instant loan', 'google'])

# You should only need to modify below to add functionality and/or fix bugs.

# Compile regular expressions once here.
regex_simpletag = re.compile(r'\[(/?([bi]|h\d|strike))\]')
regex_moretag = re.compile(r'\[more\]')
regex_imgtag = re.compile(r'\[img=(.*?)( popup=(true|false))?\]')
# [\s\S] matches any character including line breaks, so the link text and
# the [html] payload may span several lines.
regex_urltag = re.compile(r'\[url=(.*?)\]([\s\S]*?)\[/url\]')
regex_htmltag = re.compile(r'\[html\]([\s\S]*?)\[/html\]')
regex_images = re.compile(r'images/(.*)')

# NOTE(review): in the copy of this script under review the five patterns
# below were the bare characters (single quote, double quote, ampersand,
# less-than, greater-than), which made the first one a syntax error and the
# others no-ops. They are restored here as the HTML entities for those
# characters; confirm that the stored content encodes a single quote as the
# numeric entity 039.
# The patterns are assembled from an ampersand plus the entity name so that a
# tool which HTML-decodes this file cannot collapse them into no-ops again.
_AMP = '&'
regex_sglquote = re.compile(_AMP + '#039;')
regex_dblquote = re.compile(_AMP + 'quot;')
regex_ampersand = re.compile(_AMP + 'amp;')
regex_lt = re.compile(_AMP + 'lt;')
regex_gt = re.compile(_AMP + 'gt;')

_EPOCH = datetime(1970, 1, 1)
_DATEFORMAT = "%m/%d/%Y %H:%M:%S"
_UNCATEGORIZED = 'Uncategorized'


# Callback for filtering [html] tags
def htmlfilter(match):
    """Return the payload of an [html] block with its angle brackets decoded.

    match -- a regex_htmltag match; group 1 is the text between the tags.
    """
    htmlcode = match.group(1)
    htmlcode = regex_lt.sub('<', htmlcode)
    htmlcode = regex_gt.sub('>', htmlcode)
    return htmlcode


# Callback for filtering [img] tags
def imgfilter(match):
    """Return an <img> element for an [img=...] tag.

    match -- a regex_imgtag match; group 1 is the image path.

    A path that starts with "images/" is re-rooted onto ``imagesurl``; any
    other path is used unchanged.

    NOTE(review): the copy under review read the path and then returned an
    empty string, leaving ``imagesurl`` and ``regex_images`` unused, so every
    image was dropped. The markup emitted here is the presumed intent;
    confirm the attributes the target importer expects.
    """
    src = match.group(1)
    local = regex_images.match(src)
    if local:
        src = imagesurl + local.group(1)
    return '<img src="' + src + '" />'


# Get a dictionary of key-value pairs from bar-separated string
def parse(keyvaluepairs):
    """Split "KEY|value|KEY|value..." into a dictionary.

    Fields are paired up in order. A trailing key that has no value (for
    example after a dangling "|", or for an empty string) is ignored rather
    than raising IndexError. A repeated key keeps its last value.
    """
    parts = keyvaluepairs.split('|')
    return dict(zip(parts[0::2], parts[1::2]))


def _decodequotes(text):
    """Decode the quote and ampersand entities, in that order."""
    text = regex_sglquote.sub("'", text)
    text = regex_dblquote.sub('"', text)
    return regex_ampersand.sub('&', text)


def filterBody(text):
    """Convert the markup of an entry body to HTML and return the result."""
    newtext = regex_simpletag.sub(r'<\1>', text)
    newtext = regex_moretag.sub('', newtext)
    newtext = regex_imgtag.sub(imgfilter, newtext)
    # NOTE(review): only the link text (group 2) is kept; the link target in
    # group 1 is discarded.
    newtext = regex_urltag.sub(r'\2', newtext)
    newtext = _decodequotes(newtext)
    # [html] blocks are handled last, after the quote/ampersand decoding.
    return regex_htmltag.sub(htmlfilter, newtext)


def filterComment(text):
    """Convert the markup of a comment to HTML and return the result.

    Unlike filterBody, the [more], [img] and [html] tags are left untouched.
    """
    newtext = regex_simpletag.sub(r'<\1>', text)
    # Only the link text is kept; the link target is discarded.
    newtext = regex_urltag.sub(r'\2', newtext)
    return _decodequotes(newtext)


def _formatdate(epochtext):
    """Format an epoch-seconds string as a date shifted by ``delta``."""
    moment = _EPOCH + timedelta(seconds=float(epochtext)) + delta
    return moment.strftime(_DATEFORMAT)


def _primarycategory(fields):
    """Return the name mapped to the first listed category id.

    Falls back to 'Uncategorized' when there is no CATEGORIES field, when
    the first id is empty or not a number, or when it is not in categorymap.
    """
    if 'CATEGORIES' not in fields:
        return _UNCATEGORIZED
    first = fields['CATEGORIES'].split(',')[0]
    try:
        number = int(first)
    except ValueError:
        return _UNCATEGORIZED
    return categorymap.get(number, _UNCATEGORIZED)


def _formatentry(entry):
    """Build the export text for one entry (without a trailing newline)."""
    fields = parse(entry)
    author = defaultauthor
    if 'AUTHOR' in fields and not alwaysoverrideauthor:
        author = fields['AUTHOR']
    lines = [
        # Author must come first for b2evolution importer...
        'AUTHOR: ' + author,
        'TITLE: ' + fields['SUBJECT'],
        'DATE: ' + _formatdate(fields['DATE']),
        'PRIMARY CATEGORY: ' + _primarycategory(fields),
        '-----',
        'BODY:',
        filterBody(fields['CONTENT']),
        '-----',
    ]
    return '\n'.join(lines)


def emitentry(entry):
    """Print one blog entry in export format.

    entry -- the bar-separated text of an entry file. SUBJECT, DATE and
             CONTENT are required (KeyError if missing); AUTHOR and
             CATEGORIES are optional.
    """
    print(_formatentry(entry))


def _formatcomment(comment):
    """Build the export text for one comment, or None for a known spammer."""
    fields = parse(comment)
    if fields['NAME'] in spammers:
        return None
    lines = [
        'COMMENT:',
        'AUTHOR: ' + fields['NAME'],
        'DATE: ' + _formatdate(fields['DATE']),
    ]
    for key in ('URL', 'IP', 'EMAIL'):
        if key in fields:
            lines.append(key + ': ' + fields[key])
    lines.append(filterComment(fields['CONTENT']))
    lines.append('-----')
    return '\n'.join(lines)


def emitcomment(comment):
    """Print one comment in export format, skipping names in ``spammers``.

    comment -- the bar-separated text of a comment file. NAME, DATE and
               CONTENT are required (KeyError if missing); URL, IP and EMAIL
               are optional.
    """
    text = _formatcomment(comment)
    if text is not None:
        print(text)


def _readfile(path):
    """Return the whole content of the file at ``path``."""
    with open(path, 'r') as handle:
        return handle.read()


def convert(contentdir='content'):
    """Walk ``contentdir`` and print every entry followed by its comments.

    Every file whose name starts with "entry" is emitted as an entry. Its
    comments are the files whose names start with "comment" anywhere below
    the sibling directory named after the part of the entry file name before
    the first dot. Directory listings are sorted so that the output order is
    the same on every run.
    """
    for root, dirs, files in os.walk(contentdir):
        dirs.sort()
        for name in sorted(files):
            if not name.startswith('entry'):
                continue
            emitentry(_readfile(os.path.join(root, name)))
            commentroot = os.path.join(root, name.split('.')[0])
            for subroot, subdirs, subfiles in os.walk(commentroot):
                subdirs.sort()
                for subname in sorted(subfiles):
                    if subname.startswith('comment'):
                        emitcomment(_readfile(os.path.join(subroot, subname)))
            print('--------')


# Go
if __name__ == '__main__':
    convert()