forked from cybrespace/mastodon
		
	Improve language filter (#5724)
* Scrub text of HTML before detecting language.
* Detect language on statuses coming from ActivityPub.
* Fix RuboCop comments.
* Remove custom emoji from text before language detection.
This commit is contained in:
		
							parent
							
								
									9e3d24a150
								
							
						
					
					
						commit
						ad207456d6
					
				
					 2 changed files with 26 additions and 7 deletions
				
			
		|  | @ -173,7 +173,7 @@ class ActivityPub::Activity::Create < ActivityPub::Activity | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def language_from_content |   def language_from_content | ||||||
|     return nil unless language_map? |     return LanguageDetector.instance.detect(text_from_content, @account) unless language_map? | ||||||
|     @object['contentMap'].keys.first |     @object['contentMap'].keys.first | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
|  | @ -38,12 +38,31 @@ class LanguageDetector | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def simplify_text(text) |   def simplify_text(text) | ||||||
|     text.dup.tap do |new_text| |     new_text = remove_html(text) | ||||||
|     new_text.gsub!(FetchLinkCardService::URL_PATTERN, '') |     new_text.gsub!(FetchLinkCardService::URL_PATTERN, '') | ||||||
|     new_text.gsub!(Account::MENTION_RE, '') |     new_text.gsub!(Account::MENTION_RE, '') | ||||||
|     new_text.gsub!(Tag::HASHTAG_RE, '') |     new_text.gsub!(Tag::HASHTAG_RE, '') | ||||||
|  |     new_text.gsub!(/:#{CustomEmoji::SHORTCODE_RE_FRAGMENT}:/, '') | ||||||
|     new_text.gsub!(/\s+/, ' ') |     new_text.gsub!(/\s+/, ' ') | ||||||
|  |     new_text | ||||||
|   end |   end | ||||||
|  | 
 | ||||||
|  |   def new_scrubber | ||||||
|  |     scrubber = Rails::Html::PermitScrubber.new | ||||||
|  |     scrubber.tags = %w(br p) | ||||||
|  |     scrubber | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|  |   def scrubber | ||||||
|  |     @scrubber ||= new_scrubber | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|  |   def remove_html(text) | ||||||
|  |     text = Loofah.fragment(text).scrub!(scrubber).to_s | ||||||
|  |     text.gsub!('<br>', "\n") | ||||||
|  |     text.gsub!('</p><p>', "\n\n") | ||||||
|  |     text.gsub!(/(^<p>|<\/p>$)/, '') | ||||||
|  |     text | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def default_locale(account) |   def default_locale(account) | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue