#
#
# Python script to convert simplephpblog to movabletype export format.
# I wrote this for importing my simplephpblog into b2evolution, which has a movabletype
# importer, so it has not actually been tested with movabletype.
#
# Directions:
# 1) Download your 'content' directory tree from your hosting provider. If you don't know
# how to do this, try googling "wget" as a starting point. If you do use wget, be sure
# to enable recursion (-r) and expand the maximum recursion depth (-l 20)
# 2) Place this script in the same directory as the content directory is in
# 3) Modify defaults, below
# 4) Cross fingers
# 5) Run the script. The output is written to standard output, so you will need to redirect it to a file,
# e.g. "python sphp2mt.py > myblogcontent.txt"
#
# By mypalmike
# fixed by flowolf, to handle the multiple categories issue in sphpblog.
# first category is taken. rest omitted.
#
import os
import re
from datetime import datetime, timedelta
# Set your defaults here

# Author name written for an entry when the entry's own AUTHOR is not used.
defaultauthor = 'mypalmike'

# Non-zero: ignore the AUTHOR stored in each entry and always use defaultauthor.
alwaysoverrideauthor = 1

# simplephpblog category number -> category name written to the export.
# Numbers missing from this map are exported as 'Uncategorized'.
categorymap = {
    1: 'News',
    2: 'Hardware Gadgets',
    3: 'Life & Thoughts',
    8: 'Software',
    555: 'bla',
}

# Presumably the public URL of the blog's image directory - confirm before use.
imagesurl = 'http://myblog.com/images/'

# Offset from UTC - Too lazy to do real timezones
delta = timedelta( hours=1 )

# Comments whose NAME field is exactly one of these strings are not exported.
spammers = set( (
    'loan payday',
    'Casino game',
    'Sprint ringtone',
    'Instant loan',
    'google',
) )
# You should only need to modify below to add functionality and/or fix bugs.
# Compile regular expressions once here.
# Patterns containing backslashes are raw strings so that escapes such as \[ and
# \d reach the regex engine untouched instead of being treated as (invalid)
# string escapes.

# [b], [i], [h1]..[h9], [strike] and their closing forms; group 1 is the tag
# name including a leading slash for closing tags.
regex_simpletag = re.compile( r'\[(/?([bi]|h\d|strike))\]' )
regex_moretag = re.compile( r'\[more\]' )
# [img=PATH] with an optional " popup=true|false" suffix; group 1 is PATH.
regex_imgtag = re.compile( r'\[img=(.*?)( popup=(true|false))?\]' )
# [url=TARGET]TEXT[/url]; group 1 is TARGET, group 2 is TEXT (may span lines).
regex_urltag = re.compile( r'\[url=(.*?)\]((.|[\r\n])*?)\[/url\]' )
# [html]MARKUP[/html]; group 1 is MARKUP (may span lines).
regex_htmltag = re.compile( r'\[html\]((.|[\r\n])*?)\[/html\]' )
regex_images = re.compile( 'images/(.*)' )

# HTML entities that the filters turn back into literal characters.
# NOTE(review): these five patterns were reconstructed. The previous literals
# were the bare characters themselves (the single-quote one did not even parse),
# which looks like the entity text had been HTML-decoded somewhere along the
# way - confirm the spellings against the stored blog data.
regex_sglquote = re.compile( r'&#0*39;|&apos;' )
regex_dblquote = re.compile( '&quot;' )
regex_ampersand = re.compile( '&amp;' )
regex_lt = re.compile( '&lt;' )
regex_gt = re.compile( '&gt;' )
# Callback for filtering [html] tags
def htmlfilter( match ):
    """Return the contents of an [html]...[/html] block as live markup.

    match -- regex match object whose group 1 is the text between the
             [html] and [/html] tags.

    The escaped angle brackets (&lt; and &gt;) inside the block are turned
    back into '<' and '>'; everything else is returned unchanged.  The
    surrounding [html] tags are dropped because only group 1 is returned.
    """
    # Plain string replacement: the entities are fixed text, no regex needed.
    return match.group( 1 ).replace( '&lt;', '<' ).replace( '&gt;', '>' )
# Callback for filtering [img] tags
def imgfilter( match ):
    """Return an <img> element for one [img=...] tag.

    match -- regex match object whose group 1 is the image path given in
             the tag (any popup option is ignored).

    A path that starts with 'images/' is rewritten to live under the
    configured imagesurl; any other path is used as written.

    NOTE(review): this function previously captured the path and then
    returned an empty string, silently dropping every image and leaving
    imagesurl and regex_images unused.  The <img> markup emitted here is a
    reconstruction of the evident intent - confirm the exact tag format.
    """
    srcimagesurl = match.group( 1 )
    # match() anchors at the start, so only blog-relative paths are rewritten
    # and absolute URLs that merely contain 'images/' are left alone.
    local = regex_images.match( srcimagesurl )
    if local:
        srcimagesurl = imagesurl + local.group( 1 )
    return '<img src="%s" />' % srcimagesurl
# Get a dictionary of key-value pairs from bar-separated string
def parse( keyvaluepairs ):
    """Split 'KEY|value|KEY|value|...' into a dictionary.

    keyvaluepairs -- string of alternating keys and values separated by '|'.

    Returns a dict mapping each key to the value that follows it.  When a
    key occurs more than once the last value wins.  A trailing key with no
    value (an odd number of fields, e.g. from a trailing '|' or an empty
    string) is ignored instead of raising IndexError.
    """
    parts = keyvaluepairs.split( '|' )
    # Even positions are keys, odd positions are values; zip() stops at the
    # shorter slice, which drops a dangling key that has no value.
    return dict( zip( parts[0::2], parts[1::2] ) )
def filterBody( text ):
    """Convert the simplephpblog markup of an entry body to HTML.

    text -- raw CONTENT field of an entry.

    Returns the converted text.  The steps run in this order:
      1. simple tags such as [b] or [/h2] become <b> or </h2>
      2. [more] markers are removed
      3. [img=...] tags are replaced by whatever imgfilter returns
      4. [url=TARGET]TEXT[/url] becomes an <a href> link
      5. the quote and ampersand entity patterns are replaced by the
         literal characters ' " and &
      6. [html]...[/html] blocks are replaced by whatever htmlfilter returns
    """
    newtext = regex_simpletag.sub( r'<\1>', text )
    newtext = regex_moretag.sub( '', newtext )
    newtext = regex_imgtag.sub( imgfilter, newtext )
    # NOTE(review): this used to emit only the link text (\2), discarding the
    # captured target.  The anchor element restores the link - confirm that
    # this is the markup the importer should receive.
    newtext = regex_urltag.sub( r'<a href="\1">\2</a>', newtext )
    newtext = regex_sglquote.sub( "'", newtext )
    newtext = regex_dblquote.sub( '"', newtext )
    # Ampersands are handled after the quotes and before the [html] blocks,
    # exactly as before; keep this order.
    newtext = regex_ampersand.sub( '&', newtext )
    newtext = regex_htmltag.sub( htmlfilter, newtext )
    return newtext
def filterComment( text ):
    """Convert the simplephpblog markup of a comment to HTML.

    text -- raw CONTENT field of a comment.

    Returns the converted text.  Only a subset of the entry-body markup is
    handled: simple tags ([b], [i], [hN], [strike]), [url=...] links and the
    quote/ampersand entity patterns.  [more], [img] and [html] tags are left
    untouched.
    """
    newtext = regex_simpletag.sub( r'<\1>', text )
    # NOTE(review): this used to emit only the link text (\2), discarding the
    # captured target.  The anchor element restores the link - confirm that
    # links in comments are wanted in the export.
    newtext = regex_urltag.sub( r'<a href="\1">\2</a>', newtext )
    newtext = regex_sglquote.sub( "'", newtext )
    newtext = regex_dblquote.sub( '"', newtext )
    newtext = regex_ampersand.sub( '&', newtext )
    return newtext
def emitentry( entry ):
    """Print one blog entry in export format on standard output.

    entry -- raw text of an entry file ('KEY|value|...' as understood by
             parse).  SUBJECT, DATE and CONTENT must be present; AUTHOR and
             CATEGORIES are optional.

    The author is defaultauthor unless the entry has an AUTHOR and
    alwaysoverrideauthor is false.  DATE is read as seconds since the epoch,
    converted as UTC and shifted by delta.  Only the first of the
    comma-separated CATEGORIES is used; it is looked up in categorymap and
    falls back to 'Uncategorized' when it is missing, empty, not a number or
    not in the map.

    Raises KeyError if a required field is missing and ValueError if DATE is
    not a number.
    """
    fields = parse( entry )

    author = defaultauthor
    if 'AUTHOR' in fields and not alwaysoverrideauthor:
        author = fields['AUTHOR']

    title = fields['SUBJECT']
    timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

    category = 'Uncategorized'
    if 'CATEGORIES' in fields:
        firstcategory = fields['CATEGORIES'].split( ',' )[0]
        try:
            category = categorymap.get( int( firstcategory ), category )
        except ValueError:
            # Empty or non-numeric category id: keep 'Uncategorized'.
            pass

    # Each print() call has a single pre-formatted argument so the statement
    # produces the same output under Python 2 and Python 3.
    # Author must come first for b2evolution importer...
    print( 'AUTHOR: %s' % author )
    print( 'TITLE: %s' % title )
    print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
    print( 'PRIMARY CATEGORY: %s' % category )
    print( '-----' )
    print( 'BODY:' )
    print( filterBody( fields['CONTENT'] ) )
    print( '-----' )
def emitcomment( comment ):
    """Print one comment in export format on standard output.

    comment -- raw text of a comment file ('KEY|value|...' as understood by
               parse).  NAME, DATE and CONTENT must be present; URL, IP and
               EMAIL are printed only when present.

    Nothing is printed when NAME is exactly one of the strings in spammers.
    DATE is read as seconds since the epoch, converted as UTC and shifted by
    delta.

    Raises KeyError if a required field is missing and ValueError if DATE is
    not a number.
    """
    fields = parse( comment )
    if fields['NAME'] in spammers:
        return

    timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

    # Each print() call has a single pre-formatted argument so the statement
    # produces the same output under Python 2 and Python 3.
    print( 'COMMENT:' )
    print( 'AUTHOR: %s' % fields['NAME'] )
    print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
    # Optional fields, emitted in this fixed order under their own key name.
    for key in ( 'URL', 'IP', 'EMAIL' ):
        if key in fields:
            print( '%s: %s' % ( key, fields[key] ) )
    print( filterComment( fields['CONTENT'] ) )
    print( '-----' )
def convert():
    """Walk the 'content' directory and print every entry with its comments.

    Every file whose name starts with 'entry' is read and passed to
    emitentry.  The directory named after that file (the file name up to its
    first '.') is then walked, and every file in it whose name starts with
    'comment' is passed to emitcomment.  A line of eight dashes is printed
    after each entry and its comments.

    Directory and file names are visited in sorted order so that repeated
    runs over the same tree produce the same output.
    """
    for root, dirs, files in os.walk( 'content' ):
        # Sorting dirs in place also fixes the order os.walk descends in.
        dirs.sort()
        for name in sorted( files ):
            if not name.startswith( 'entry' ):
                continue

            # 'with' closes the file even if reading fails.
            with open( os.path.join( root, name ), 'r' ) as entryfile:
                entryval = entryfile.read()
            emitentry( entryval )

            entrydir = os.path.join( root, name.split( '.' )[0] )
            for root2, dirs2, files2 in os.walk( entrydir ):
                dirs2.sort()
                for name2 in sorted( files2 ):
                    if not name2.startswith( 'comment' ):
                        continue
                    with open( os.path.join( root2, name2 ), 'r' ) as commentfile:
                        commentval = commentfile.read()
                    emitcomment( commentval )

            print( '--------' )
# Go: run the conversion as soon as this file is executed.
# NOTE(review): the call is unguarded, so importing this file as a module
# also runs the conversion.
convert()