1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
| # -*- coding: utf-8 -*-
| """Converts an IRI to a URI."""
|
| __author__ = "Joe Gregorio (joe@bitworking.org)"
| __copyright__ = "Copyright 2006, Joe Gregorio"
| __contributors__ = []
| __version__ = "1.0.0"
| __license__ = "MIT"
|
| import urllib.parse
|
| # Convert an IRI to a URI following the rules in RFC 3987
| #
| # The characters we need to encode and escape are defined in the spec:
| #
| # iprivate = %xE000-F8FF / %xF0000-FFFFD / %x100000-10FFFD
| # ucschar = %xA0-D7FF / %xF900-FDCF / %xFDF0-FFEF
| # / %x10000-1FFFD / %x20000-2FFFD / %x30000-3FFFD
| # / %x40000-4FFFD / %x50000-5FFFD / %x60000-6FFFD
| # / %x70000-7FFFD / %x80000-8FFFD / %x90000-9FFFD
| # / %xA0000-AFFFD / %xB0000-BFFFD / %xC0000-CFFFD
| # / %xD0000-DFFFD / %xE1000-EFFFD
|
# Inclusive (first, last) code point intervals whose characters get
# percent-encoded: the RFC 3987 'ucschar' and 'iprivate' sets merged into a
# single list kept in ascending order.
escape_range = (
    [
        # Basic Multilingual Plane; E000-F8FF is the 'iprivate' block.
        (0xA0, 0xD7FF),
        (0xE000, 0xF8FF),
        (0xF900, 0xFDCF),
        (0xFDF0, 0xFFEF),
    ]
    # Planes 1 through 13 each contribute p0000-pFFFD.
    + [(plane << 16, (plane << 16) | 0xFFFD) for plane in range(0x1, 0xE)]
    + [
        # Plane 14 starts at E1000, not E0000.
        (0xE1000, 0xEFFFD),
        # Planes 15 and 16 are the remaining 'iprivate' intervals.
        (0xF0000, 0xFFFFD),
        (0x100000, 0x10FFFD),
    ]
)
|
|
def encode(c):
    """Percent-encode one character if it falls inside ``escape_range``.

    ``c`` is a single-character string. When its code point lies within one
    of the inclusive intervals in ``escape_range``, the character is UTF-8
    encoded and every resulting octet is emitted as ``%`` followed by its
    upper-case hexadecimal value. Any other character is returned unchanged.
    """
    code_point = ord(c)
    for first, last in escape_range:
        # The scan gives up at the first interval that begins above the
        # code point; later intervals are not examined.
        if code_point < first:
            return c
        if code_point <= last:
            return "".join("%%%2X" % octet for octet in c.encode("utf-8"))
    return c
|
|
def iri2uri(uri):
    """Return the URI form of an IRI.

    The IRI must be passed in as a ``str``; do not UTF-8 encode it before
    calling this function. An argument that is not a ``str`` is handed back
    unchanged.

    The authority component is converted with the ``idna`` codec, the pieces
    are reassembled, and every character of the result is then run through
    ``encode()``.
    """
    if not isinstance(uri, str):
        return uri

    scheme, netloc, path, query, fragment = urllib.parse.urlsplit(uri)
    ascii_netloc = netloc.encode("idna").decode("utf-8")
    rebuilt = urllib.parse.urlunsplit((scheme, ascii_netloc, path, query, fragment))
    # For each character in 'ucschar' or 'iprivate':
    #   1. encode as utf-8
    #   2. then %-encode each octet of that utf-8
    return "".join(encode(ch) for ch in rebuilt)
|
|
if __name__ == "__main__":
    import unittest

    class Test(unittest.TestCase):
        """Self-checks executed when the module is run as a script."""

        def test_uris(self):
            """Plain URIs must pass through iri2uri() unchanged."""
            for plain in (
                "ftp://ftp.is.co.za/rfc/rfc1808.txt",
                "http://www.ietf.org/rfc/rfc2396.txt",
                "ldap://[2001:db8::7]/c=GB?objectClass?one",
                "mailto:John.Doe@example.com",
                "news:comp.infosystems.www.servers.unix",
                "tel:+1-816-555-1212",
                "telnet://192.0.2.16:80/",
                "urn:oasis:names:specification:docbook:dtd:xml:4.1.2",
            ):
                self.assertEqual(plain, iri2uri(plain))

        def test_iri(self):
            """Each part of the URI must receive the right kind of escaping."""
            mixed_iri = "/fred?bar=\N{BLACK LEFT POINTING INDEX}#\N{COMET}"
            mixed_uri = "/fred?bar=%E2%98%9A#%E2%98%84"

            # (expected URI, input IRI) pairs, checked in order.
            cases = [
                ("http://xn--o3h.com/%E2%98%84", "http://\N{COMET}.com/\N{COMET}"),
                (
                    "http://bitworking.org/?fred=%E2%98%84",
                    "http://bitworking.org/?fred=\N{COMET}",
                ),
                ("http://bitworking.org/#%E2%98%84", "http://bitworking.org/#\N{COMET}"),
                ("#%E2%98%84", "#\N{COMET}"),
                (mixed_uri, mixed_iri),
            ]
            for expected, iri in cases:
                self.assertEqual(expected, iri2uri(iri))

            # Converting an already converted value must not change it.
            self.assertEqual(mixed_uri, iri2uri(iri2uri(mixed_iri)))
            # UTF-8 encoded bytes are not converted, so the result differs.
            self.assertNotEqual(mixed_uri, iri2uri(mixed_iri.encode("utf-8")))

    unittest.main()
|
|