gajim-plural/scripts/migrate_logs_to_dot9_db.py

#!/usr/bin/env python
import os
import sre

from pysqlite2 import dbapi2 as sqlite

# Decide where the old plain-text logs live and where the new database goes.
if os.name != 'nt':
	# every non-Windows platform keeps both under ~/.gajim
	PATH_TO_DB = os.path.expanduser('~/.gajim/logs.db') # database is called logs.db
	PATH_TO_LOGS_BASE_DIR = os.path.expanduser('~/.gajim/logs')
else:
	try:
		# os.environ raises KeyError when 'appdata' is not set
		PATH_TO_LOGS_BASE_DIR = os.path.join(os.environ['appdata'], 'Gajim', 'Logs')
		PATH_TO_DB = os.path.join(os.environ['appdata'], 'Gajim', 'logs.db') # database is called logs.db
	except KeyError:
		# win9x: no appdata variable, use paths relative to the current directory
		PATH_TO_DB = 'logs.db'
		PATH_TO_LOGS_BASE_DIR = 'Logs'

jids_already_in = [] # JIDs for which a row was already inserted into the jids table
# Open the database file at PATH_TO_DB; con and cur are shared by the rest
# of the script.
con = sqlite.connect(PATH_TO_DB)
cur = con.cursor()
# Create the two tables.
# jids: one row per JID, jid_id is its numeric key.
# logs: one row per logged line; logs.jid_id refers to jids.jid_id, but no
# foreign key is declared in the schema, so the link is maintained by the
# Python code.
# logs.type can be 'gc', 'gcstatus', 'recv', 'sent', 'status'
# NOTE: plain CREATE TABLE is used, so this fails if the tables already exist.
cur.executescript(
	'''
	CREATE TABLE jids(
		jid_id INTEGER PRIMARY KEY AUTOINCREMENT,
		jid TEXT UNIQUE
	);

	CREATE TABLE logs(
		log_line_id INTEGER PRIMARY KEY AUTOINCREMENT UNIQUE,
		jid_id INTEGER,
		contact_name TEXT,
		time INTEGER,
		type TEXT,
		show TEXT,
		message TEXT
	);
	'''
	)

con.commit()

# Matches the two-character sequence backslash + 'n' (an escaped newline).
# (?<!\\) is a negative lookbehind assertion: the match is rejected when the
# character right before it is a backslash.
# NOTE: the name `re` is bound to this compiled pattern, not to a module.
re = sre.compile(r'(?<!\\)\\n')

def from_one_line(msg):
	# Undo the one-line escaping of a log line and return the result.
	#
	# Step 1: every backslash + 'n' that is not preceded by another backslash
	#         is turned into a real newline (the module-level pattern `re`
	#         does the matching).
	# Step 2: every doubled backslash is collapsed into a single one.
	#
	# Example (Python literals):
	#   s12 = 'test\\ntest\\\\ntest'
	#   s13 = re.sub('\n', s12)
	#   s14 = s13.replace('\\\\', '\\')
	#   s14 == 'test\ntest\\ntest'
	with_newlines = re.sub('\n', msg)
	return with_newlines.replace('\\\\', '\\')


def get_jid(dirname, filename):
	'''Return the lower-cased JID that the log file dirname/filename belongs to.

	The JID is:
	- filename itself when the file sits directly in the logs base directory
	  (dirname ends with 'logs' or 'Logs'), i.e. a normal contact;
	- ROOM_JID when the file is named like the folder it is in (room/room);
	- ROOM_JID/nick for any other file inside a room folder (private message).

	The room JID is taken from the last component of dirname. The previous
	code sliced the base directory prefix off the full path, which kept the
	leading separator ('/room@server') and did not strip anything for
	Windows or relative paths.
	'''
	if dirname.endswith('logs') or dirname.endswith('Logs'):
		# we have file (not dir) in logs base dir, so it's a normal contact
		jid = filename # file is JID
	else:
		# we are in a room folder (so it can be either pm or message in room);
		# normpath drops a trailing separator so that basename is never empty
		room_jid = os.path.basename(os.path.normpath(dirname))
		if filename == room_jid: # room/room
			jid = room_jid
		else: # room/nick it's pm
			jid = room_jid + '/' + filename

	return jid.lower()

def _parse_log_line(line):
	'''Split one unescaped log line into the values stored in the logs table.

	A line looks like date:type:data[:data...]. Returns the tuple
	(contact_name, time, type, show, message), in the column order used by
	the INSERT in visit(), or None when the line is malformed and has to be
	dropped.
	'''
	fields = line.split(':')
	if len(fields) <= 2:
		return None

	# fields[0] is the date; some lines are broken, just drop them
	try:
		tim = int(float(fields[0]))
	except (ValueError, OverflowError):
		return None

	# fields[1] is 'gc', 'gcstatus', 'recv', 'sent' and if nothing of those
	# the line is a status line
	kind = fields[1]
	message_data = fields[2:] # never empty, len(fields) > 2 was checked above

	contact_name = None
	show = None
	if kind == 'gc':
		contact_name = message_data[0]
		message = ':'.join(message_data[1:])
	elif kind == 'gcstatus':
		if len(message_data) < 2:
			# no show field: malformed, drop it instead of raising IndexError
			return None
		contact_name = message_data[0]
		show = message_data[1]
		message = ':'.join(message_data[2:]) # status msg
	elif kind in ('recv', 'sent'):
		message = ':'.join(message_data)
	else: # status
		kind = 'status'
		show = message_data[0]
		message = ':'.join(message_data[1:]) # status msg

	# NOTE: message still ends with the line terminator read from the file
	return (contact_name, tim, kind, show, message)


def visit(arg, dirname, filenames):
	'''os.path.walk callback: import every log file of dirname into the DB.

	arg is unused (the walk passes None). Each regular file in filenames,
	except 'notify.log', is mapped to a JID with get_jid(); the JID gets a
	row in the jids table the first time it is seen and every well-formed
	line of the file becomes a row in the logs table.
	'''
	insert_sql = 'INSERT INTO logs (jid_id, contact_name, time, type, show, message) '\
		'VALUES (?, ?, ?, ?, ?, ?)'

	for filename in filenames:
		# Don't take this file into account, this is dup info
		# notifications are also in contact log file
		if filename == 'notify.log':
			continue
		path_to_text_file = os.path.join(dirname, filename)
		if os.path.isdir(path_to_text_file):
			continue

		jid = get_jid(dirname, filename)
		# single pre-formatted argument: same output as print 'Processing', jid
		print('Processing %s' % jid)

		# create the jids row only the first time we meet this jid
		if jid not in jids_already_in:
			jids_already_in.append(jid)
			cur.execute('INSERT INTO jids (jid) VALUES (?)', (jid,))
			con.commit()

		# Always look the id up by jid with a bound parameter. The old code
		# used SELECT MAX(jid), which returns a jid string rather than an id,
		# and built the other query by string formatting.
		cur.execute('SELECT jid_id FROM jids WHERE jid = ?', (jid,))
		JID_ID = cur.fetchone()[0]

		f = open(path_to_text_file, 'r')
		try:
			for line in f:
				values = _parse_log_line(from_one_line(line))
				if values is None:
					continue
				cur.execute(insert_sql, (JID_ID,) + values)
		finally:
			f.close()
			# one commit per file instead of one per line; done in finally so
			# rows inserted before an error are still saved
			con.commit()

if __name__ == '__main__':
	# import every old log file found under the logs base dir
	os.path.walk(PATH_TO_LOGS_BASE_DIR, visit, None)

	# leave a note next to the old plain-text files
	readme_lines = (
		'We do not use plain-text files anymore, because they do not scale.\n',
		'Those files here are logs for Gajim up until 0.8.2\n',
		'We now use an sqlite database called logs.db found in ~/.gajim\n',
		'You can always run the migration script to import your old logs to the database\n',
		'Thank you\n',
	)
	f = open(os.path.join(PATH_TO_LOGS_BASE_DIR, 'README'), 'w')
	f.writelines(readme_lines)
	f.close()

	# after huge import create the indices (they are slow on massive insert)
	cur.executescript(
		'''
		CREATE UNIQUE INDEX jids_already_index ON jids (jid);
		CREATE INDEX JID_ID_Index ON logs (jid_id);
		'''
	)