* Allow Unicode characters in URL query strings. Fixes #8408 * Alternative approach to Unicode support in URLs. Adds a PoC/idea to approach this problem.
This commit is contained in:
		
							parent
							
								
									687a0cbcb0
								
							
						
					
					
						commit
						fdf819b83e
					
				
					 2 changed files with 67 additions and 4 deletions
				
			
		|  | @ -99,7 +99,7 @@ class Formatter | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|   def encode_and_link_urls(html, accounts = nil, options = {}) |   def encode_and_link_urls(html, accounts = nil, options = {}) | ||||||
|     entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false) |     entities = utf8_friendly_extractor(html, extract_url_without_protocol: false) | ||||||
| 
 | 
 | ||||||
|     if accounts.is_a?(Hash) |     if accounts.is_a?(Hash) | ||||||
|       options  = accounts |       options  = accounts | ||||||
|  | @ -199,6 +199,43 @@ class Formatter | ||||||
|     result.flatten.join |     result.flatten.join | ||||||
|   end |   end | ||||||
| 
 | 
 | ||||||
|  |   def utf8_friendly_extractor(text, options = {}) | ||||||
|  |     old_to_new_index = [0] | ||||||
|  | 
 | ||||||
|  |     escaped = text.chars.map do |c| | ||||||
|  |       output = c.ord.to_s(16).length > 2 ? CGI.escape(c) : c | ||||||
|  |       old_to_new_index << old_to_new_index.last + output.length | ||||||
|  |       output | ||||||
|  |     end.join | ||||||
|  | 
 | ||||||
|  |     # Note: I couldn't obtain list_slug with @user/list-name format | ||||||
|  |     # for mention so this requires additional check | ||||||
|  |     special = Extractor.extract_entities_with_indices(escaped, options).map do |extract| | ||||||
|  |       # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present | ||||||
|  |       key = (extract.keys & [:url, :hashtag, :screen_name, :cashtag]).first | ||||||
|  | 
 | ||||||
|  |       new_indices = [ | ||||||
|  |         old_to_new_index.find_index(extract[:indices].first), | ||||||
|  |         old_to_new_index.find_index(extract[:indices].last), | ||||||
|  |       ] | ||||||
|  | 
 | ||||||
|  |       has_prefix_char = [:hashtag, :screen_name, :cashtag].include?(key) | ||||||
|  |       value_indices = [ | ||||||
|  |         new_indices.first + (has_prefix_char ? 1 : 0), # account for #, @ or $ | ||||||
|  |         new_indices.last - 1, | ||||||
|  |       ] | ||||||
|  | 
 | ||||||
|  |       next extract.merge( | ||||||
|  |         :indices => new_indices, | ||||||
|  |         key => text[value_indices.first..value_indices.last] | ||||||
|  |       ) | ||||||
|  |     end | ||||||
|  | 
 | ||||||
|  |     standard = Extractor.extract_entities_with_indices(text, options) | ||||||
|  | 
 | ||||||
|  |     Extractor.remove_overlapping_entities(special + standard) | ||||||
|  |   end | ||||||
|  | 
 | ||||||
|   def link_to_url(entity, options = {}) |   def link_to_url(entity, options = {}) | ||||||
|     url        = Addressable::URI.parse(entity[:url]) |     url        = Addressable::URI.parse(entity[:url]) | ||||||
|     html_attrs = { target: '_blank', rel: 'nofollow noopener' } |     html_attrs = { target: '_blank', rel: 'nofollow noopener' } | ||||||
|  |  | ||||||
|  | @ -74,10 +74,36 @@ RSpec.describe Formatter do | ||||||
|     end |     end | ||||||
| 
 | 
 | ||||||
|     context 'given a URL with a query string' do |     context 'given a URL with a query string' do | ||||||
|       let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } |       context 'with escaped unicode character' do | ||||||
|  |         let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' } | ||||||
| 
 | 
 | ||||||
|       it 'matches the full URL' do |         it 'matches the full URL' do | ||||||
|         is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' |           is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink"' | ||||||
|  |         end | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       context 'with unicode character' do | ||||||
|  |         let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink' } | ||||||
|  | 
 | ||||||
|  |         it 'matches the full URL' do | ||||||
|  |           is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"' | ||||||
|  |         end | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       context 'with unicode character at the end' do | ||||||
|  |         let(:text) { 'https://www.ruby-toolbox.com/search?utf8=✓' } | ||||||
|  | 
 | ||||||
|  |         it 'matches the full URL' do | ||||||
|  |           is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=✓"' | ||||||
|  |         end | ||||||
|  |       end | ||||||
|  | 
 | ||||||
|  |       context 'with escaped and not escaped unicode characters' do | ||||||
|  |         let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink' } | ||||||
|  | 
 | ||||||
|  |         it 'preserves escaped unicode characters' do | ||||||
|  |           is_expected.to include 'href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"' | ||||||
|  |         end | ||||||
|       end |       end | ||||||
|     end |     end | ||||||
| 
 | 
 | ||||||
|  |  | ||||||
		Loading…
	
	Add table
		
		Reference in a new issue