Remove usernames and hashtags from language detection (#3503)
* Add failing specs for hashtag and username extraction in language detector
* Remove usernames and hashtags from text before language detection
* Handle multiple instances of special case, and reduce whitespace
This commit is contained in:
		
							parent
							
								
									d1e08bd38c
								
							
						
					
					
						commit
						d010e270e6
					
				
					 2 changed files with 47 additions and 2 deletions
				
			
		| 
						 | 
					@ -13,6 +13,10 @@ class LanguageDetector
 | 
				
			||||||
    detected_language_code || default_locale.to_sym
 | 
					    detected_language_code || default_locale.to_sym
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					  def prepared_text
 | 
				
			||||||
 | 
					    simplified_text.strip
 | 
				
			||||||
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  private
 | 
					  private
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def detected_language_code
 | 
					  def detected_language_code
 | 
				
			||||||
| 
						 | 
					@ -20,18 +24,21 @@ class LanguageDetector
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def result
 | 
					  def result
 | 
				
			||||||
    @result ||= @identifier.find_language(text_without_urls)
 | 
					    @result ||= @identifier.find_language(prepared_text)
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def detected_language_reliable?
 | 
					  def detected_language_reliable?
 | 
				
			||||||
    result.reliable?
 | 
					    result.reliable?
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  def text_without_urls
 | 
					  def simplified_text
 | 
				
			||||||
    text.dup.tap do |new_text|
 | 
					    text.dup.tap do |new_text|
 | 
				
			||||||
      URI.extract(new_text).each do |url|
 | 
					      URI.extract(new_text).each do |url|
 | 
				
			||||||
        new_text.gsub!(url, '')
 | 
					        new_text.gsub!(url, '')
 | 
				
			||||||
      end
 | 
					      end
 | 
				
			||||||
 | 
					      new_text.gsub!(Account::MENTION_RE, '')
 | 
				
			||||||
 | 
					      new_text.gsub!(Tag::HASHTAG_RE, '')
 | 
				
			||||||
 | 
					      new_text.gsub!(/\s+/, ' ')
 | 
				
			||||||
    end
 | 
					    end
 | 
				
			||||||
  end
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
| 
						 | 
					@ -1,7 +1,45 @@
 | 
				
			||||||
# frozen_string_literal: true
 | 
					# frozen_string_literal: true
 | 
				
			||||||
 | 
					
 | 
				
			||||||
require 'rails_helper'
 | 
					require 'rails_helper'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
describe LanguageDetector do
 | 
					describe LanguageDetector do
 | 
				
			||||||
 | 
					  describe 'prepared_text' do
 | 
				
			||||||
 | 
					    it 'returns unmodified string without special cases' do
 | 
				
			||||||
 | 
					      string = 'just a regular string'
 | 
				
			||||||
 | 
					      result = described_class.new(string).prepared_text
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      expect(result).to eq string
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    it 'collapses spacing in strings' do
 | 
				
			||||||
 | 
					      string = 'The formatting   in    this is very        odd'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      result = described_class.new(string).prepared_text
 | 
				
			||||||
 | 
					      expect(result).to eq 'The formatting in this is very odd'
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    it 'strips usernames from strings before detection' do
 | 
				
			||||||
 | 
					      string = '@username Yeah, very surreal...! also @friend'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      result = described_class.new(string).prepared_text
 | 
				
			||||||
 | 
					      expect(result).to eq 'Yeah, very surreal...! also'
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    it 'strips URLs from strings before detection' do
 | 
				
			||||||
 | 
					      string = 'Our website is https://example.com and also http://localhost.dev'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      result = described_class.new(string).prepared_text
 | 
				
			||||||
 | 
					      expect(result).to eq 'Our website is and also'
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					    it 'strips #hashtags from strings before detection' do
 | 
				
			||||||
 | 
					      string = 'Hey look at all the #animals and #fish'
 | 
				
			||||||
 | 
					
 | 
				
			||||||
 | 
					      result = described_class.new(string).prepared_text
 | 
				
			||||||
 | 
					      expect(result).to eq 'Hey look at all the and'
 | 
				
			||||||
 | 
					    end
 | 
				
			||||||
 | 
					  end
 | 
				
			||||||
 | 
					
 | 
				
			||||||
  describe 'to_iso_s' do
 | 
					  describe 'to_iso_s' do
 | 
				
			||||||
    it 'detects english language for basic strings' do
 | 
					    it 'detects english language for basic strings' do
 | 
				
			||||||
      strings = [
 | 
					      strings = [
 | 
				
			||||||
| 
						 | 
					
 | 
				
			||||||
		Loading…
	
	Add table
		
		Reference in a new issue