try to guess encoding when we add a new table and a new log line when we migrate
This commit is contained in:
parent
41d31cc31d
commit
1ea3f10a41
|
@ -90,6 +90,19 @@ def get_jid(dirname, filename):
|
||||||
jid = jid.lower()
|
jid = jid.lower()
|
||||||
return jid
|
return jid
|
||||||
|
|
||||||
|
def decode_string(string):
    '''Try to decode the given byte string into a Unicode instance.

    The candidate encodings are tried in order: the filesystem encoding,
    then UTF-8, then ISO-8859-15. The first one that decodes without error
    wins.

    string -- the byte string to decode; a value that is already a Unicode
              instance is returned unchanged.

    Returns the decoded Unicode instance, or None when no candidate
    encoding could decode the input.
    '''
    # Already-decoded text must not go through .decode(): on Python 2 that
    # implicitly re-encodes it as ASCII first, so any non-ASCII character
    # would fail for every candidate and the value would be dropped.
    if isinstance(string, type(u'')):
        return string

    # ISO-8859-15 is the last resort: if the data is not really in that
    # encoding, wrong characters will be shown.
    encodings = (sys.getfilesystemencoding(), 'utf-8', 'iso-8859-15')
    for encoding in encodings:
        if not encoding:
            # sys.getfilesystemencoding() may return None, and
            # decode(None) raises TypeError instead of UnicodeError.
            continue
        try:
            return string.decode(encoding)
        except (UnicodeError, LookupError):
            # UnicodeError: wrong encoding for this data.
            # LookupError: the codec name is unknown to this interpreter.
            continue

    return None
|
||||||
|
|
||||||
def visit(arg, dirname, filenames):
|
def visit(arg, dirname, filenames):
|
||||||
print 'Visiting', dirname
|
print 'Visiting', dirname
|
||||||
for filename in filenames:
|
for filename in filenames:
|
||||||
|
@ -97,9 +110,8 @@ def visit(arg, dirname, filenames):
|
||||||
# notifications are also in contact log file
|
# notifications are also in contact log file
|
||||||
if filename == 'notify.log':
|
if filename == 'notify.log':
|
||||||
continue
|
continue
|
||||||
try:
|
filename = decode_string(filename)
|
||||||
filename.decode('utf-8')
|
if not filename:
|
||||||
except:
|
|
||||||
continue
|
continue
|
||||||
path_to_text_file = os.path.join(dirname, filename)
|
path_to_text_file = os.path.join(dirname, filename)
|
||||||
if os.path.isdir(path_to_text_file):
|
if os.path.isdir(path_to_text_file):
|
||||||
|
@ -137,6 +149,9 @@ def visit(arg, dirname, filenames):
|
||||||
# sent ==> chat_msg_sent, status ==> status
|
# sent ==> chat_msg_sent, status ==> status
|
||||||
type = splitted_line[1] # line[1] has type of logged message
|
type = splitted_line[1] # line[1] has type of logged message
|
||||||
message_data = splitted_line[2:] # line[2:] has message data
|
message_data = splitted_line[2:] # line[2:] has message data
|
||||||
|
message_data = decode_string(message_data)
|
||||||
|
if not message_data:
|
||||||
|
continue
|
||||||
# line[0] is date,
|
# line[0] is date,
|
||||||
|
|
||||||
# some lines can be fucked up, just drop them
|
# some lines can be fucked up, just drop them
|
||||||
|
|
Loading…
Reference in New Issue