* Allow Unicode characters in URL query strings. Fixes #8408 * Alternative approach to Unicode support in URLs. Adds a PoC/idea for how to approach this problem.
This commit is contained in:
		
							parent
							
								
									5092d17f29
								
							
						
					
					
						commit
						6a5e3da6b0
					
				
					 2 changed files with 67 additions and 4 deletions
				
			
		| 
						 | 
				
			
			@ -99,7 +99,7 @@ class Formatter
 | 
			
		|||
  end
 | 
			
		||||
 | 
			
		||||
  def encode_and_link_urls(html, accounts = nil, options = {})
 | 
			
		||||
    entities = Extractor.extract_entities_with_indices(html, extract_url_without_protocol: false)
 | 
			
		||||
    entities = utf8_friendly_extractor(html, extract_url_without_protocol: false)
 | 
			
		||||
 | 
			
		||||
    if accounts.is_a?(Hash)
 | 
			
		||||
      options  = accounts
 | 
			
		||||
| 
						 | 
				
			
			@ -199,6 +199,43 @@ class Formatter
 | 
			
		|||
    result.flatten.join
 | 
			
		||||
  end
 | 
			
		||||
 | 
			
		||||
  # Extracts entities from +text+ with two scans and merges the results.
  #
  # The first scan runs over a copy of +text+ in which every character above
  # U+00FF has been percent-encoded with CGI.escape; the entities it finds are
  # mapped back onto +text+ (indices and value both refer to the original,
  # unescaped string). The second scan runs over +text+ unchanged. Both result
  # lists are handed to Extractor.remove_overlapping_entities.
  #
  # text    - the String to scan.
  # options - Hash passed through unchanged to
  #           Extractor.extract_entities_with_indices.
  #
  # Returns whatever Extractor.remove_overlapping_entities returns for the
  # combined list (presumably an Array of entity Hashes - confirm against the
  # extractor in use).
  #
  # NOTE: carried over from the original author - a mention written as
  # @user/list-name also has a list slug, which is not remapped here and
  # requires an additional check.
  def utf8_friendly_extractor(text, options = {})
    # escaped_offsets[i] is the offset in the escaped string at which the
    # i-th character of +text+ begins; the last element is the escaped length.
    escaped_offsets = [0]

    escaped = text.chars.map do |char|
      # Only code points that need more than two hex digits are encoded.
      piece = char.ord > 0xFF ? CGI.escape(char) : char
      escaped_offsets << escaped_offsets.last + piece.length
      piece
    end.join

    # Reverse lookup (escaped offset => original character index). Offsets are
    # strictly increasing, so every key is unique.
    original_index_at = escaped_offsets.each_with_index.to_h

    special = Extractor.extract_entities_with_indices(escaped, options).map do |entity|
      first = original_index_at[entity[:indices].first]
      last  = original_index_at[entity[:indices].last]

      # A boundary that falls inside a percent-escape sequence has no
      # counterpart in the original text. Skip the entity instead of raising;
      # the scan of the raw text below still covers that span.
      next unless first && last

      # exactly one of :url, :hashtag, :screen_name, :cashtag keys is present
      key = (entity.keys & %i[url hashtag screen_name cashtag]).first

      # The stored value excludes the leading #, @ or $.
      value_start = %i[hashtag screen_name cashtag].include?(key) ? first + 1 : first

      entity.merge(indices: [first, last], key => text[value_start...last])
    end.compact

    standard = Extractor.extract_entities_with_indices(text, options)

    Extractor.remove_overlapping_entities(special + standard)
  end
 | 
			
		||||
 | 
			
		||||
  def link_to_url(entity, options = {})
 | 
			
		||||
    url        = Addressable::URI.parse(entity[:url])
 | 
			
		||||
    html_attrs = { target: '_blank', rel: 'nofollow noopener' }
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
| 
						 | 
				
			
			@ -74,6 +74,7 @@ RSpec.describe Formatter do
 | 
			
		|||
    end
 | 
			
		||||
 | 
			
		||||
    context 'given a URL with a query string' do
 | 
			
		||||
      context 'with escaped unicode character' do
 | 
			
		||||
        let(:text) { 'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&q=autolink' }
 | 
			
		||||
 | 
			
		||||
        it 'matches the full URL' do
 | 
			
		||||
| 
						 | 
				
			
			@ -81,6 +82,31 @@ RSpec.describe Formatter do
 | 
			
		|||
        end
 | 
			
		||||
      end
 | 
			
		||||
 | 
			
		||||
      # A raw (unescaped) Unicode character in the middle of the query string
      # must not cut the generated link short.
      context 'with unicode character' do
        let(:text) do
          'https://www.ruby-toolbox.com/search?utf8=✓&q=autolink'
        end

        it 'matches the full URL' do
          expect(subject).to include('href="https://www.ruby-toolbox.com/search?utf8=✓&q=autolink"')
        end
      end
 | 
			
		||||
 | 
			
		||||
      # The raw Unicode character is the very last character of the URL, so the
      # link has to extend up to and including it.
      context 'with unicode character at the end' do
        let(:text) do
          'https://www.ruby-toolbox.com/search?utf8=✓'
        end

        it 'matches the full URL' do
          expect(subject).to include('href="https://www.ruby-toolbox.com/search?utf8=✓"')
        end
      end
 | 
			
		||||
 | 
			
		||||
      # Mixes an already percent-encoded sequence with a raw Unicode character:
      # the encoded part must stay encoded and the raw part must stay raw.
      context 'with escaped and not escaped unicode characters' do
        let(:text) do
          'https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink'
        end

        it 'preserves escaped unicode characters' do
          expect(subject).to include('href="https://www.ruby-toolbox.com/search?utf8=%E2%9C%93&utf81=✓&q=autolink"')
        end
      end
 | 
			
		||||
    end
 | 
			
		||||
 | 
			
		||||
    context 'given a URL with parentheses in it' do
 | 
			
		||||
      let(:text) { 'https://en.wikipedia.org/wiki/Diaspora_(software)' }
 | 
			
		||||
 | 
			
		||||
| 
						 | 
				
			
			
 | 
			
		|||
		Loading…
	
	Add table
		
		Reference in a new issue