#
#
# Python script to convert simplephpblog to movabletype export format.
# I wrote this for importing my simplephpblog into b2evolution, which has a movabletype
# importer, so it has not actually been tested with movabletype.
#
# Directions:
# 1) Download your 'content' directory tree from your hosting provider.  If you don't know
#    how to do this, try googling "wget" as a starting point.  If you do use wget, be sure
#    to enable recursion (-r) and expand the maximum recursion depth (-l 20)
# 2) Place this script in the same directory as the content directory is in
# 3) Modify defaults, below
# 4) Cross fingers
# 5) Run the script.  The output is written to standard output, so you will need to
#    redirect it to a file, e.g. "python sphp2mt.py > myblogcontent.txt"
#
# By mypalmike
# Fixed by flowolf to handle entries with multiple categories in sphpblog:
#                   only the first category is used; the rest are omitted.
# 
import os
import re
from datetime import datetime, timedelta

# User-tunable defaults -- edit these to match your blog before running.

# Author name written for every entry (see alwaysoverrideauthor).
defaultauthor = 'mypalmike'

# Non-zero: ignore any AUTHOR stored in an entry and always use defaultauthor.
alwaysoverrideauthor = 1

# simplephpblog category number -> category name written to the export.
categorymap = {
	1: 'News',
	2: 'Hardware Gadgets',
	3: 'Life & Thoughts',
	8: 'Software',
	555: 'bla',
}

# Base URL that [img] tags pointing into 'images/' are rewritten to.
imagesurl = 'http://myblog.com/images/'

# Offset from UTC added to every timestamp - Too lazy to do real timezones
delta = timedelta( hours = 1 )

# Comments whose NAME field exactly matches one of these are skipped.
spammers = set( [
	'loan payday',
	'Casino game',
	'Sprint ringtone',
	'Instant loan',
	'google',
] )

# You should only need to modify below to add functionality and/or fix bugs.

# Compile regular expressions once here.
# Raw strings are used so that backslashes reach the regex engine untouched;
# sequences such as '\[' or '\d' in a normal string literal are invalid string
# escapes (deprecated, and a SyntaxWarning on recent Python 3 releases).

# [b] [/b] [i] [/i] [h1].. [strike] etc.; group 1 is the tag name incl. any '/'.
regex_simpletag = re.compile( r'\[(/?([bi]|h\d|strike))\]' )
# [more] marker.
regex_moretag = re.compile( r'\[more\]' )
# [img=SRC] with an optional ' popup=true|false' suffix; group 1 is SRC.
regex_imgtag = re.compile( r'\[img=(.*?)( popup=(true|false))?\]' )
# [url=HREF]text[/url]; group 1 is HREF, group 2 the (possibly multi-line) text.
regex_urltag = re.compile( r'\[url=(.*?)\]((.|[\r\n])*?)\[/url\]' )
# [html]...[/html]; group 1 is the (possibly multi-line) enclosed markup.
regex_htmltag = re.compile( r'\[html\]((.|[\r\n])*?)\[/html\]' )
# 'images/<rest of line>'; group 1 is everything after 'images/'.
regex_images = re.compile( r'images/(.*)' )
# HTML entities that get turned back into literal characters.
regex_sglquote = re.compile( r'&#039;' )
regex_dblquote = re.compile( r'&quot;' )
regex_ampersand = re.compile( r'&amp;' )
regex_lt = re.compile( r'&lt;' )
regex_gt = re.compile( r'&gt;' )

# Callback for filtering [html] tags
def htmlfilter( match ):
	"""Replacement callback for regex_htmltag.

	Takes the text captured between [html] and [/html] (group 1 of match),
	turns '&lt;' into '<' and then '&gt;' into '>', and returns the result.
	"""
	inner = match.group(1)
	return regex_gt.sub( '>', regex_lt.sub( '<', inner ) )

# Callback for filtering [img] tags
def imgfilter( match ):
	"""Replacement callback for regex_imgtag; returns an <img .../> element.

	Group 1 of match is the image source from the [img=...] tag.  Every
	'images/<rest>' portion of it is rewritten to '<imagesurl>/<rest>'; any
	text in front of 'images/' is kept as it is.
	"""
	src = match.group(1)
	# Normalise the base so exactly one '/' separates it from the file name,
	# whether or not the configured imagesurl ends with a slash.
	base = imagesurl.rstrip( '/' )
	# A callable replacement is used so that imagesurl is inserted literally;
	# in a replacement template, backslashes in it would be read as escapes.
	rebased = regex_images.sub( lambda m: base + '/' + m.group(1), src )
	return '<img src="' + rebased + '"/>'

# Get a dictionary of key-value pairs from bar-separated string
def parse( keyvaluepairs ):
	"""Return a dictionary built from a bar-separated 'KEY|value|KEY|value' string.

	Parts at even positions are keys and the part following each one is its
	value.  If a key occurs more than once the last value wins.  A trailing
	key that has no value (odd number of parts, which includes the empty
	string) is dropped rather than raising IndexError.
	"""
	parts = keyvaluepairs.split( '|' )
	# zip() stops at the shorter slice, so an unpaired final key is ignored.
	return dict( zip( parts[0::2], parts[1::2] ) )

def filterBody( text ):
	"""Return entry body text with the blog markup converted to HTML.

	Converts simple tags, [more], [img], [url] and [html] blocks, and turns
	the &#039;, &quot; and &amp; entities back into literal characters.
	"""
	newtext = text
	newtext = regex_simpletag.sub( r'<\1>', newtext )
	newtext = regex_moretag.sub( '<!--more-->', newtext )
	newtext = regex_imgtag.sub( imgfilter, newtext )
	newtext = regex_urltag.sub( r'<a href="\1">\2</a>', newtext )
	newtext = regex_sglquote.sub( "'", newtext )
	newtext = regex_dblquote.sub( '"', newtext )
	# [html] blocks must be processed before '&amp;' is unescaped.  Otherwise
	# an escaped entity such as '&amp;lt;' inside an [html] block first becomes
	# '&lt;' and is then unescaped a second time into a bare '<'.
	newtext = regex_htmltag.sub( htmlfilter, newtext )
	newtext = regex_ampersand.sub( '&', newtext )
	return newtext

def filterComment( text ):
	"""Return comment text with simple tags and [url] converted to HTML.

	Also turns the &#039;, &quot; and &amp; entities back into literal
	characters.  The substitutions are applied in the order listed.
	"""
	substitutions = (
		( regex_simpletag, r'<\1>' ),
		( regex_urltag, r'<a href="\1">\2</a>' ),
		( regex_sglquote, "'" ),
		( regex_dblquote, '"' ),
		( regex_ampersand, '&' ),
	)
	result = text
	for pattern, replacement in substitutions:
		result = pattern.sub( replacement, result )
	return result

def emitentry( entry ):
	"""Print one blog entry (header lines and body) to standard output.

	entry is the raw bar-separated text of an entry file.  SUBJECT, DATE and
	CONTENT must be present (KeyError otherwise); AUTHOR and CATEGORIES are
	optional.  Only the first of several comma-separated categories is used.
	"""
	fields = parse( entry )
	author = defaultauthor
	if 'AUTHOR' in fields and not alwaysoverrideauthor:
		author = fields['AUTHOR']
	title = fields['SUBJECT']
	# DATE is seconds since the epoch; delta is the configured offset from UTC.
	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta
	category = 'Uncategorized'
	if 'CATEGORIES' in fields:
		firstcategory = fields['CATEGORIES'].split( ',' )[0]
		try:
			categoryNum = int( firstcategory )
		except ValueError:
			# Empty or non-numeric category list: keep 'Uncategorized'.
			categoryNum = None
		category = categorymap.get( categoryNum, category )
	# Each print() gets a single pre-formatted string, which produces the same
	# output under both the Python 2 print statement and the Python 3 function.
	# Author must come first for b2evolution importer...
	print( 'AUTHOR: %s' % author )
	print( 'TITLE: %s' % title )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	print( 'PRIMARY CATEGORY: %s' % category )
	# Optional header lines, disabled by default:
#	print( 'STATUS: publish' )
#	print( 'ALLOW COMMENTS: 1' )
#	print( 'ALLOW PINGS: 0' )
	print( '-----' )
	print( 'BODY:' )
	print( filterBody( fields['CONTENT'] ) )
	print( '-----' )

def emitcomment( comment ):
	"""Print one comment block to standard output, unless it is from a spammer.

	comment is the raw bar-separated text of a comment file.  NAME, DATE and
	CONTENT must be present (KeyError otherwise); URL, IP and EMAIL are
	printed only when present.  Nothing is printed if NAME is in spammers.
	"""
	fields = parse( comment )

	if fields['NAME'] in spammers:
		return

	# DATE is seconds since the epoch; delta is the configured offset from UTC.
	timestamp = datetime.utcfromtimestamp( float( fields['DATE'] ) ) + delta

	# Each print() gets a single pre-formatted string, which produces the same
	# output under both the Python 2 print statement and the Python 3 function.
	print( 'COMMENT:' )
	print( 'AUTHOR: %s' % fields['NAME'] )
	print( 'DATE: %s' % timestamp.strftime( "%m/%d/%Y %H:%M:%S" ) )
	for key in ( 'URL', 'IP', 'EMAIL' ):
		if key in fields:
			print( '%s: %s' % ( key, fields[key] ) )
	print( filterComment( fields['CONTENT'] ) )
	print( '-----' )

def _readfile( path ):
	"""Return the whole contents of the file at path, closing it afterwards."""
	# 'with' closes the handle even if read() raises.
	with open( path, 'r' ) as handle:
		return handle.read()

def convert():
	"""Walk the 'content' directory tree and print every entry with its comments.

	Every file whose name starts with 'entry' is emitted through emitentry().
	Comments are then looked up by walking the sibling directory named after
	the part of the entry file name before its first '.', and every file
	there whose name starts with 'comment' is emitted through emitcomment().
	A '--------' line is printed after each entry and its comments.
	"""
	for root, dirs, files in os.walk( 'content' ):
		for name in files:
			if not name.startswith( 'entry' ):
				continue
			emitentry( _readfile( os.path.join( root, name ) ) )
			# Comments live below a directory named like the entry file
			# without its extension(s).
			commentroot = os.path.join( root, name.split( '.' )[0] )
			for root2, dirs2, files2 in os.walk( commentroot ):
				for name2 in files2:
					if name2.startswith( 'comment' ):
						emitcomment( _readfile( os.path.join( root2, name2 ) ) )
			print( '--------' )

# Go: run the conversion unconditionally when this file is executed.
# Note that this call is not guarded, so it also runs if the file is imported.
convert()