Improve deduplication

A stanza-id is only unique within a specific archive.
This adds the archive JID to the duplicate-lookup query.
This commit is contained in:
Philipp Hörist 2017-11-18 17:40:09 +01:00
parent cac1185a23
commit 7f1776ede9
2 changed files with 27 additions and 9 deletions

View file

@ -1062,7 +1062,7 @@ class MamMessageReceivedEvent(nec.NetworkIncomingEvent, HelperEvent):
def generate(self): def generate(self):
archive_jid = self.stanza.getFrom() archive_jid = self.stanza.getFrom()
own_jid = self.conn.get_own_jid() own_jid = self.conn.get_own_jid().getStripped()
if archive_jid and not archive_jid.bareMatch(own_jid): if archive_jid and not archive_jid.bareMatch(own_jid):
# MAM Message not from our Archive # MAM Message not from our Archive
return False return False
@ -1076,7 +1076,7 @@ class MamMessageReceivedEvent(nec.NetworkIncomingEvent, HelperEvent):
self.unique_id, origin_id = self.get_unique_id() self.unique_id, origin_id = self.get_unique_id()
# Check for duplicates # Check for duplicates
if app.logger.find_stanza_id(self.unique_id, origin_id): if app.logger.find_stanza_id(own_jid, self.unique_id, origin_id):
return return
self.msgtxt = self.msg_.getTagData('body') self.msgtxt = self.msg_.getTagData('body')
@ -1150,16 +1150,18 @@ class MamGcMessageReceivedEvent(nec.NetworkIncomingEvent, HelperEvent):
self.kind = KindConstant.GC_MSG self.kind = KindConstant.GC_MSG
def generate(self): def generate(self):
self.room_jid = self.stanza.getFrom()
self.msg_ = self.forwarded.getTag('message', protocol=True) self.msg_ = self.forwarded.getTag('message', protocol=True)
if self.msg_.getType() != 'groupchat': if self.msg_.getType() != 'groupchat':
return False return False
self.room_jid = self.stanza.getFrom().getStripped()
self.unique_id = self.get_stanza_id(self.result, query=True) self.unique_id = self.get_stanza_id(self.result, query=True)
# Check for duplicates # Check for duplicates
if app.logger.find_stanza_id(self.unique_id): if app.logger.find_stanza_id(self.room_jid, self.unique_id,
groupchat=True):
return return
self.msgtxt = self.msg_.getTagData('body') self.msgtxt = self.msg_.getTagData('body')
@ -1282,7 +1284,9 @@ class MessageReceivedEvent(nec.NetworkIncomingEvent, HelperEvent):
# Check groupchat messages for duplicates, # Check groupchat messages for duplicates,
# We do this because of MUC History messages # We do this because of MUC History messages
if self.stanza.getType() == 'groupchat': if self.stanza.getType() == 'groupchat':
if app.logger.find_stanza_id(self.unique_id): if app.logger.find_stanza_id(self.stanza.getFrom().getStripped(),
self.unique_id,
groupchat=True):
return return
address_tag = self.stanza.getTag('addresses', address_tag = self.stanza.getTag('addresses',

View file

@ -1088,12 +1088,19 @@ class Logger:
return True return True
return False return False
def find_stanza_id(self, stanza_id, origin_id=None): def find_stanza_id(self, archive_jid, stanza_id, origin_id=None,
groupchat=False):
""" """
Checks if a stanza-id is already in the `logs` table Checks if a stanza-id is already in the `logs` table
:param archive_jid: The jid of the archive the stanza-id belongs to
:param stanza_id: The stanza-id :param stanza_id: The stanza-id
:param origin_id: The origin-id
:param groupchat: stanza-id is from a groupchat
return True if the stanza-id was found return True if the stanza-id was found
""" """
ids = [] ids = []
@ -1105,12 +1112,19 @@ class Logger:
if not ids: if not ids:
return False return False
archive_id = self.get_jid_id(archive_jid)
if groupchat:
column = 'jid_id'
else:
column = 'account_id'
sql = ''' sql = '''
SELECT stanza_id FROM logs SELECT stanza_id FROM logs
WHERE stanza_id IN ({values}) LIMIT 1 WHERE stanza_id IN ({values}) AND {archive} = ? LIMIT 1
'''.format(values=', '.join('?' * len(ids))) '''.format(values=', '.join('?' * len(ids)),
archive=column)
result = self.con.execute(sql, tuple(ids)).fetchone() result = self.con.execute(sql, tuple(ids) + (archive_id,)).fetchone()
if result is not None: if result is not None:
log.info('Found duplicated message, stanza-id: %s, origin-id: %s', log.info('Found duplicated message, stanza-id: %s, origin-id: %s',