Fix language detection of non-latin alphabets even at few characters (#10276)

This commit is contained in:
Eugen Rochko 2019-03-15 05:07:09 +01:00 committed by GitHub
parent ba84b6d4d7
commit 1b167707c2
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 26 additions and 9 deletions

View File

@ -4,6 +4,7 @@ class LanguageDetector
include Singleton include Singleton
CHARACTER_THRESHOLD = 140 CHARACTER_THRESHOLD = 140
RELIABLE_CHARACTERS_RE = /[\p{Hebrew}\p{Arabic}\p{Syriac}\p{Thaana}\p{Nko}\p{Han}\p{Katakana}\p{Hiragana}\p{Hangul}]+/m
def initialize def initialize
@identifier = CLD3::NNetLanguageIdentifier.new(1, 2048) @identifier = CLD3::NNetLanguageIdentifier.new(1, 2048)
@ -11,15 +12,14 @@ class LanguageDetector
def detect(text, account) def detect(text, account)
input_text = prepare_text(text) input_text = prepare_text(text)
return if input_text.blank? return if input_text.blank?
detect_language_code(input_text) || default_locale(account) detect_language_code(input_text) || default_locale(account)
end end
def language_names def language_names
@language_names = @language_names = CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }.uniq
CLD3::TaskContextParams::LANGUAGE_NAMES.map { |name| iso6391(name.to_s).to_sym }
.uniq
end end
private private
@ -29,12 +29,29 @@ class LanguageDetector
end end
def unreliable_input?(text) def unreliable_input?(text)
text.size < CHARACTER_THRESHOLD !reliable_input?(text)
end
# Decides whether +text+ may be handed to the language identifier.
#
# Text qualifies when it passes the length check, or failing that, when it
# passes the character-set check.
#
# @param text [String] the prepared input text
# @return [Boolean] result of the length check when it is truthy, otherwise
#   the result of the character-set check
def reliable_input?(text)
  long_enough = sufficient_text_length?(text)
  return long_enough if long_enough

  # Too short on its own: fall back to the character-set check.
  language_specific_character_set?(text)
end
# Checks the size of +text+ against CHARACTER_THRESHOLD.
#
# @param text [String] the prepared input text
# @return [Boolean] true when the text has at least CHARACTER_THRESHOLD
#   characters (the bound is inclusive)
def sufficient_text_length?(text)
  CHARACTER_THRESHOLD <= text.size
end
# Reports whether a large enough share of +text+ consists of characters
# matched by RELIABLE_CHARACTERS_RE.
#
# The sizes of all matching runs are added up and divided by the total size
# of +text+; the share must be strictly greater than 0.3.
#
# @param text [String] the prepared input text
# @return [Boolean] true when more than 30% of the characters match;
#   false otherwise, including for empty text or text with no match
def language_specific_character_set?(text)
  # Guard explicitly so the ratio below never divides zero by zero.
  return false if text.empty?

  # scan yields every run of matching characters; no matches sums to 0,
  # which gives a ratio of 0.0 and therefore false.
  matching_characters = text.scan(RELIABLE_CHARACTERS_RE).sum(&:size)
  matching_characters.to_f / text.size > 0.3
end
def detect_language_code(text) def detect_language_code(text)
return if unreliable_input?(text) return if unreliable_input?(text)
result = @identifier.find_language(text) result = @identifier.find_language(text)
iso6391(result.language.to_s).to_sym if result.reliable? iso6391(result.language.to_s).to_sym if result.reliable?
end end
@ -77,6 +94,6 @@ class LanguageDetector
end end
def default_locale(account) def default_locale(account)
return account.user_locale&.to_sym || I18n.default_locale if account.local? account.user_locale&.to_sym || I18n.default_locale if account.local?
end end
end end

View File

@ -106,11 +106,11 @@ describe LanguageDetector do
end end
describe 'remote user' do describe 'remote user' do
it 'nil for foreign user when language is not present' do it 'detects Korean language' do
string = '안녕하세요' string = '안녕하세요'
result = described_class.instance.detect(string, account_remote) result = described_class.instance.detect(string, account_remote)
expect(result).to eq nil expect(result).to eq :ko
end end
end end