Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector

* Remove usernames and hashtags from text before language detection

* Handle multiple instances of special case, and reduce whitespace
This commit is contained in:
Matt Jankowski 2017-06-01 09:29:14 -04:00 committed by GitHub
parent d1e08bd38c
commit d010e270e6
2 changed files with 47 additions and 2 deletions

View File

@@ -13,6 +13,10 @@ class LanguageDetector
detected_language_code || default_locale.to_sym detected_language_code || default_locale.to_sym
end end
def prepared_text
simplified_text.strip
end
private private
def detected_language_code def detected_language_code
@@ -20,18 +24,21 @@ class LanguageDetector
end end
def result def result
@result ||= @identifier.find_language(text_without_urls) @result ||= @identifier.find_language(prepared_text)
end end
def detected_language_reliable? def detected_language_reliable?
result.reliable? result.reliable?
end end
def text_without_urls def simplified_text
text.dup.tap do |new_text| text.dup.tap do |new_text|
URI.extract(new_text).each do |url| URI.extract(new_text).each do |url|
new_text.gsub!(url, '') new_text.gsub!(url, '')
end end
new_text.gsub!(Account::MENTION_RE, '')
new_text.gsub!(Tag::HASHTAG_RE, '')
new_text.gsub!(/\s+/, ' ')
end end
end end

View File

@@ -1,7 +1,45 @@
# frozen_string_literal: true # frozen_string_literal: true
require 'rails_helper' require 'rails_helper'
describe LanguageDetector do describe LanguageDetector do
describe 'prepared_text' do
it 'returns unmodified string without special cases' do
string = 'just a regular string'
result = described_class.new(string).prepared_text
expect(result).to eq string
end
it 'collapses spacing in strings' do
string = 'The formatting   in     this is very        odd'
result = described_class.new(string).prepared_text
expect(result).to eq 'The formatting in this is very odd'
end
it 'strips usernames from strings before detection' do
string = '@username Yeah, very surreal...! also @friend'
result = described_class.new(string).prepared_text
expect(result).to eq 'Yeah, very surreal...! also'
end
it 'strips URLs from strings before detection' do
string = 'Our website is https://example.com and also http://localhost.dev'
result = described_class.new(string).prepared_text
expect(result).to eq 'Our website is and also'
end
it 'strips #hashtags from strings before detection' do
string = 'Hey look at all the #animals and #fish'
result = described_class.new(string).prepared_text
expect(result).to eq 'Hey look at all the and'
end
end
describe 'to_iso_s' do describe 'to_iso_s' do
it 'detects english language for basic strings' do it 'detects english language for basic strings' do
strings = [ strings = [