Remove usernames and hashtags from language detection (#3503)

* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
2017-06-01 09:29:14 -04:00 · 2017-06-01 09:29:14 -04:00 · d010e270e6
commit d010e270e6
parent d1e08bd38c
2 changed files with 47 additions and 2 deletions
--- a/app/lib/language_detector.rb
+++ b/app/lib/language_detector.rb
@ -13,6 +13,10 @@ class LanguageDetector
    detected_language_code || default_locale.to_sym
  end
  # Returns the simplified text with leading and trailing whitespace removed.
  def prepared_text
    cleaned = simplified_text
    cleaned.strip
  end
  private
  def detected_language_code
@ -20,18 +24,21 @@ class LanguageDetector
  end
  # Detection result returned by @identifier.find_language, cached in @result.
  # Because of `||=`, the lookup is repeated only while @result is nil or false.
  def result
-    @result ||= @identifier.find_language(text_without_urls)
+    @result ||= @identifier.find_language(prepared_text)
  end
  # Reports whether the cached detection result considers itself reliable;
  # the answer comes straight from the result object's `reliable?`.
  def detected_language_reliable?
    detection = result
    detection.reliable?
  end
-  def text_without_urls
+  def simplified_text
    # Builds a cleaned copy of `text`: URLs, mention matches and hashtag
    # matches are removed, then whitespace runs are collapsed.
    # `dup` keeps the in-place gsub! calls from mutating the original `text`;
    # `tap` returns the copy even though gsub! returns nil when nothing matched.
    text.dup.tap do |new_text|
      # Remove every URL that URI.extract finds in the copy.
      URI.extract(new_text).each do |url|
        new_text.gsub!(url, '')
      end
      # Patterns are defined outside this view; presumably @username mentions
      # and #hashtags, as the accompanying specs expect — confirm against
      # Account and Tag.
      new_text.gsub!(Account::MENTION_RE, '')
      new_text.gsub!(Tag::HASHTAG_RE, '')
      # Collapse each whitespace run (including the gaps left by the removals
      # above) to a single space.
      new_text.gsub!(/\s+/, ' ')
    end
  end
--- a/spec/lib/language_detector_spec.rb
+++ b/spec/lib/language_detector_spec.rb
@ -1,7 +1,45 @@
 # frozen_string_literal: true
 require 'rails_helper'
 describe LanguageDetector do
  describe 'prepared_text' do
    # Each example wraps raw input in a detector and checks the cleaned text
    # exposed through #prepared_text.
    it 'returns unmodified string without special cases' do
      input = 'just a regular string'

      expect(described_class.new(input).prepared_text).to eq input
    end

    it 'collapses spacing in strings' do
      detector = described_class.new('The formatting   in    this is very        odd')

      expect(detector.prepared_text).to eq 'The formatting in this is very odd'
    end

    it 'strips usernames from strings before detection' do
      detector = described_class.new('@username Yeah, very surreal...! also @friend')

      expect(detector.prepared_text).to eq 'Yeah, very surreal...! also'
    end

    it 'strips URLs from strings before detection' do
      detector = described_class.new('Our website is https://example.com and also http://localhost.dev')

      expect(detector.prepared_text).to eq 'Our website is and also'
    end

    it 'strips #hashtags from strings before detection' do
      detector = described_class.new('Hey look at all the #animals and #fish')

      expect(detector.prepared_text).to eq 'Hey look at all the and'
    end
  end
  describe 'to_iso_s' do
    it 'detects english language for basic strings' do
      strings = [