Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Ensure normalized IDNA domains return ASCII strings #90

Merged
merged 23 commits into from
Sep 15, 2016
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
23 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
157 changes: 132 additions & 25 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -27,33 +27,132 @@ Usage (this output was created with [`examples/url.rb`][examples]):
require "twingly/url"

url = Twingly::URL.parse("http://www.twingly.co.uk/search")
url.scheme # => "http"
url.trd # => "www"
url.sld # => "twingly"
url.tld # => "co.uk"
url.ttld # => "uk"
url.domain # => "twingly.co.uk"
url.host # => "www.twingly.co.uk"
url.origin # => "http://www.twingly.co.uk"
url.path # => "/search"
url.without_scheme # => "//www.twingly.co.uk/search"
url.valid? # => "true"
url.scheme # => "http"
url.normalized.scheme # => "http"
url.trd # => "www"
url.normalized.trd # => "www"
url.sld # => "twingly"
url.normalized.sld # => "twingly"
url.tld # => "co.uk"
url.normalized.tld # => "co.uk"
url.ttld # => "uk"
url.normalized.ttld # => "uk"
url.domain # => "twingly.co.uk"
url.normalized.domain # => "twingly.co.uk"
url.host # => "www.twingly.co.uk"
url.normalized.host # => "www.twingly.co.uk"
url.origin # => "http://www.twingly.co.uk"
url.normalized.origin # => "http://www.twingly.co.uk"
url.path # => "/search"
url.normalized.path # => "/search"
url.without_scheme # => "//www.twingly.co.uk/search"
url.normalized.without_scheme # => "//www.twingly.co.uk/search"
url.userinfo # => ""
url.normalized.userinfo # => ""
url.user # => ""
url.normalized.user # => ""
url.password # => ""
url.normalized.password # => ""
url.valid? # => "true"
url.normalized.valid? # => "true"
url.to_s # => "http://www.twingly.co.uk/search"
url.normalized.to_s # => "http://www.twingly.co.uk/search"

url = Twingly::URL.parse("http://räksmörgås.макдональдс.рф/foo")
url.scheme # => "http"
url.normalized.scheme # => "http"
url.trd # => "räksmörgås"
url.normalized.trd # => "xn--rksmrgs-5wao1o"
url.sld # => "макдональдс"
url.normalized.sld # => "xn--80aalb1aicli8a5i"
url.tld # => "рф"
url.normalized.tld # => "xn--p1ai"
url.ttld # => "рф"
url.normalized.ttld # => "xn--p1ai"
url.domain # => "макдональдс.рф"
url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
url.host # => "räksmörgås.макдональдс.рф"
url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.path # => "/foo"
url.normalized.path # => "/foo"
url.without_scheme # => "//räksmörgås.макдональдс.рф/foo"
url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
url.userinfo # => ""
url.normalized.userinfo # => ""
url.user # => ""
url.normalized.user # => ""
url.password # => ""
url.normalized.password # => ""
url.valid? # => "true"
url.normalized.valid? # => "true"
url.to_s # => "http://räksmörgås.макдональдс.рф/foo"
url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"

url = Twingly::URL.parse("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo")
url.scheme # => "http"
url.normalized.scheme # => "http"
url.trd # => "xn--rksmrgs-5wao1o"
url.normalized.trd # => "xn--rksmrgs-5wao1o"
url.sld # => "xn--80aalb1aicli8a5i"
url.normalized.sld # => "xn--80aalb1aicli8a5i"
url.tld # => "xn--p1ai"
url.normalized.tld # => "xn--p1ai"
url.ttld # => "xn--p1ai"
url.normalized.ttld # => "xn--p1ai"
url.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
url.normalized.domain # => "xn--80aalb1aicli8a5i.xn--p1ai"
url.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.normalized.host # => "xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.normalized.origin # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai"
url.path # => "/foo"
url.normalized.path # => "/foo"
url.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
url.normalized.without_scheme # => "//xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
url.userinfo # => ""
url.normalized.userinfo # => ""
url.user # => ""
url.normalized.user # => ""
url.password # => ""
url.normalized.password # => ""
url.valid? # => "true"
url.normalized.valid? # => "true"
url.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"
url.normalized.to_s # => "http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo"

url = Twingly::URL.parse("https://admin:correcthorsebatterystaple@example.com/")
url.scheme # => "https"
url.trd # => ""
url.sld # => "example"
url.tld # => "com"
url.ttld # => "com"
url.domain # => "example.com"
url.host # => "example.com"
url.origin # => "https://example.com"
url.path # => "/"
url.without_scheme # => "//admin:correcthorsebatterystaple@example.com/"
url.userinfo # => "admin:correcthorsebatterystaple"
url.user # => "admin"
url.password # => "correcthorsebatterystaple"
url.valid? # => "true"
url.scheme # => "https"
url.normalized.scheme # => "https"
url.trd # => ""
url.normalized.trd # => "www"
url.sld # => "example"
url.normalized.sld # => "example"
url.tld # => "com"
url.normalized.tld # => "com"
url.ttld # => "com"
url.normalized.ttld # => "com"
url.domain # => "example.com"
url.normalized.domain # => "example.com"
url.host # => "example.com"
url.normalized.host # => "www.example.com"
url.origin # => "https://example.com"
url.normalized.origin # => "https://www.example.com"
url.path # => "/"
url.normalized.path # => "/"
url.without_scheme # => "//admin:correcthorsebatterystaple@example.com/"
url.normalized.without_scheme # => "//admin:correcthorsebatterystaple@www.example.com/"
url.userinfo # => "admin:correcthorsebatterystaple"
url.normalized.userinfo # => "admin:correcthorsebatterystaple"
url.user # => "admin"
url.normalized.user # => "admin"
url.password # => "correcthorsebatterystaple"
url.normalized.password # => "correcthorsebatterystaple"
url.valid? # => "true"
url.normalized.valid? # => "true"
url.to_s # => "https://admin:correcthorsebatterystaple@example.com/"
url.normalized.to_s # => "https://admin:correcthorsebatterystaple@www.example.com/"
```

### Dependencies
Expand All @@ -63,6 +162,14 @@ The gem requires libidn.
sudo apt-get install libidn11 # Ubuntu
brew install libidn # OS X

## Development

To inspect the [Public Suffix List], this handy command can be used (it also works in projects that use `twingly-url` as a dependency).

open $(bundle show public_suffix)/data/list.txt

[Public Suffix List]: https://github.com/weppos/publicsuffix-ruby

## Tests

Run tests with
Expand Down
77 changes: 41 additions & 36 deletions examples/url.rb
Original file line number Diff line number Diff line change
@@ -1,43 +1,48 @@
require "bundler/setup"
require_relative "../lib/twingly/url"

url_as_string = "http://www.twingly.co.uk/search"
url = Twingly::URL.parse(url_as_string)
# Parses +url_as_string+ with Twingly::URL.parse and prints a Ruby-snippet
# style listing of the result: for every attribute, one line for the parsed
# URL followed by one line for the same attribute on +url.normalized+.
#
# url_as_string - the URL text handed to Twingly::URL.parse.
#
# Writes to standard output via +puts+.
def print_url_details(url_as_string)
url = Twingly::URL.parse(url_as_string)

# NOTE(review): the same "require" line is also printed at top level further
# down in this diff view; this may be a removed line shown inline — confirm.
puts "require \"twingly/url\""
puts "url = Twingly::URL.parse(\"#{url_as_string}\")"
# Each attribute is printed twice: as parsed, then on the normalized URL.
puts "url.scheme # => \"#{url.scheme}\""
puts "url.normalized.scheme # => \"#{url.normalized.scheme}\""
puts "url.trd # => \"#{url.trd}\""
puts "url.normalized.trd # => \"#{url.normalized.trd}\""
puts "url.sld # => \"#{url.sld}\""
puts "url.normalized.sld # => \"#{url.normalized.sld}\""
puts "url.tld # => \"#{url.tld}\""
puts "url.normalized.tld # => \"#{url.normalized.tld}\""
puts "url.ttld # => \"#{url.ttld}\""
puts "url.normalized.ttld # => \"#{url.normalized.ttld}\""
puts "url.domain # => \"#{url.domain}\""
puts "url.normalized.domain # => \"#{url.normalized.domain}\""
puts "url.host # => \"#{url.host}\""
puts "url.normalized.host # => \"#{url.normalized.host}\""
puts "url.origin # => \"#{url.origin}\""
puts "url.normalized.origin # => \"#{url.normalized.origin}\""
puts "url.path # => \"#{url.path}\""
puts "url.normalized.path # => \"#{url.normalized.path}\""
puts "url.without_scheme # => \"#{url.without_scheme}\""
puts "url.normalized.without_scheme # => \"#{url.normalized.without_scheme}\""
puts "url.userinfo # => \"#{url.userinfo}\""
puts "url.normalized.userinfo # => \"#{url.normalized.userinfo}\""
puts "url.user # => \"#{url.user}\""
puts "url.normalized.user # => \"#{url.normalized.user}\""
puts "url.password # => \"#{url.password}\""
puts "url.normalized.password # => \"#{url.normalized.password}\""
puts "url.valid? # => \"#{url.valid?}\""
puts "url.normalized.valid? # => \"#{url.normalized.valid?}\""
puts "url.to_s # => \"#{url.to_s}\""
puts "url.normalized.to_s # => \"#{url.normalized.to_s}\""
end

puts "require \"twingly/url\""
puts

puts "url = Twingly::URL.parse(\"#{url_as_string}\")"
puts "url.scheme # => \"#{url.scheme}\""
puts "url.trd # => \"#{url.trd}\""
puts "url.sld # => \"#{url.sld}\""
puts "url.tld # => \"#{url.tld}\""
puts "url.ttld # => \"#{url.ttld}\""
puts "url.domain # => \"#{url.domain}\""
puts "url.host # => \"#{url.host}\""
puts "url.origin # => \"#{url.origin}\""
puts "url.path # => \"#{url.path}\""
puts "url.without_scheme # => \"#{url.without_scheme}\""
puts "url.valid? # => \"#{url.valid?}\""

print_url_details("http://www.twingly.co.uk/search")
puts

url_as_string = "https://admin:correcthorsebatterystaple@example.com/"
url = Twingly::URL.parse(url_as_string)

puts "url = Twingly::URL.parse(\"#{url_as_string}\")"
puts "url.scheme # => \"#{url.scheme}\""
puts "url.trd # => \"#{url.trd}\""
puts "url.sld # => \"#{url.sld}\""
puts "url.tld # => \"#{url.tld}\""
puts "url.ttld # => \"#{url.ttld}\""
puts "url.domain # => \"#{url.domain}\""
puts "url.host # => \"#{url.host}\""
puts "url.origin # => \"#{url.origin}\""
puts "url.path # => \"#{url.path}\""
puts "url.without_scheme # => \"#{url.without_scheme}\""
puts "url.userinfo # => \"#{url.userinfo}\""
puts "url.user # => \"#{url.user}\""
puts "url.password # => \"#{url.password}\""
puts "url.valid? # => \"#{url.valid?}\""
print_url_details("http://räksmörgås.макдональдс.рф/foo")
puts
print_url_details("http://xn--rksmrgs-5wao1o.xn--80aalb1aicli8a5i.xn--p1ai/foo")
puts
print_url_details("https://admin:correcthorsebatterystaple@example.com/")
37 changes: 37 additions & 0 deletions lib/twingly/public_suffix_list.rb
Original file line number Diff line number Diff line change
@@ -0,0 +1,37 @@
require "public_suffix"

module Twingly
  # Builds a Public Suffix List that, in addition to the rules shipped with
  # the public_suffix gem, contains the ASCII form of every rule whose
  # ASCII conversion yields at least one "xn--" label.
  class PublicSuffixList
    # Matches a label that begins with the ACE prefix "xn--" (any letter case).
    ACE_PREFIX = /\Axn\-\-/i.freeze

    private_constant :ACE_PREFIX

    # Extend the PSL with ASCII form of all internationalized domain names.
    #
    # Reads the list file at PublicSuffix::List::DEFAULT_LIST_PATH, parses it
    # with private domains disabled, and adds one rule per punycoded name.
    #
    # Returns the extended list.
    def self.with_punycoded_names
      raw_list = File.read(PublicSuffix::List::DEFAULT_LIST_PATH)

      PublicSuffix::List.parse(raw_list, private_domains: false).tap do |psl|
        # The names are fully collected before any rule is added to the list.
        punycoded_names(psl).each do |ascii_name|
          # Reindexing is deferred and done once after all rules are added.
          psl.add(PublicSuffix::Rule.factory(ascii_name), reindex: false)
        end

        psl.reindex!
      end
    end

    # Converts every rule value in +list+ with Addressable::IDNA.to_ascii and
    # keeps only the results that contain an ACE-prefixed label.
    def self.punycoded_names(list)
      list
        .map { |rule| Addressable::IDNA.to_ascii(rule.value) }
        .select { |ascii_name| punycoded_name?(ascii_name) }
    end

    # True when any label of +name+ (as split by
    # PublicSuffix::Domain.name_to_labels) starts with the ACE prefix.
    def self.punycoded_name?(name)
      labels = PublicSuffix::Domain.name_to_labels(name)
      labels.any? { |label| label =~ ACE_PREFIX }
    end

    private_class_method :punycoded_names, :punycoded_name?
  end
end
22 changes: 14 additions & 8 deletions lib/twingly/url.rb
Original file line number Diff line number Diff line change
Expand Up @@ -2,25 +2,28 @@
require "addressable/idna/native"
require "public_suffix"

require_relative "public_suffix_list"
require_relative "url/null_url"
require_relative "url/error"
require_relative "version"

PublicSuffix::List.private_domains = false

module Twingly
class URL
include Comparable

ACCEPTED_SCHEMES = /\Ahttps?\z/i
CUSTOM_PSL = PublicSuffixList.with_punycoded_names
ENDS_WITH_SLASH = /\/+$/
ERRORS_TO_EXTEND = [
Addressable::URI::InvalidURIError,
PublicSuffix::DomainInvalid,
IDN::Idna::IdnaError,
]

private_constant :ACCEPTED_SCHEMES, :ENDS_WITH_SLASH, :ERRORS_TO_EXTEND
private_constant :ACCEPTED_SCHEMES
private_constant :CUSTOM_PSL
private_constant :ENDS_WITH_SLASH
private_constant :ERRORS_TO_EXTEND

class << self
def parse(potential_url)
Expand All @@ -36,9 +39,12 @@ def internal_parse(potential_url)
scheme = addressable_uri.scheme
raise Twingly::URL::Error::ParseError unless scheme =~ ACCEPTED_SCHEMES

display_uri = addressable_display_uri(addressable_uri)
# URLs that can't be normalized should not be valid
try_addressable_normalize(addressable_uri)

public_suffix_domain = PublicSuffix.parse(display_uri.host)
host = addressable_uri.host
public_suffix_domain = PublicSuffix.parse(host, list: CUSTOM_PSL,
default_rule: nil)
raise Twingly::URL::Error::ParseError if public_suffix_domain.nil?

raise Twingly::URL::Error::ParseError if public_suffix_domain.sld.nil?
Expand All @@ -63,8 +69,8 @@ def to_addressable_uri(potential_url)

# Workaround for the following bug in addressable:
# https://github.com/sporkmonger/addressable/issues/224
def addressable_display_uri(addressable_uri)
addressable_uri.display_uri
def try_addressable_normalize(addressable_uri)
addressable_uri.normalize
rescue ArgumentError => error
if error.message.include?("invalid byte sequence in UTF-8")
raise Twingly::URL::Error::ParseError
Expand All @@ -76,7 +82,7 @@ def addressable_display_uri(addressable_uri)
private :new
private :internal_parse
private :to_addressable_uri
private :addressable_display_uri
private :try_addressable_normalize
end

def initialize(addressable_uri, public_suffix_domain)
Expand Down
Loading