-
Notifications
You must be signed in to change notification settings - Fork 0
/
cassis.py
174 lines (148 loc) · 7.9 KB
/
cassis.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
# -*- coding: utf-8 -*-
import re
import sys
import os.path
import urllib
if sys.version < '3':
from urlparse import urlparse
else:
from urllib.parse import urlparse
xrange = range
def auto_link_re():
return re.compile('(?:\\@[_a-zA-Z0-9]{1,17})|(?:(?:(?:(?:http|https|irc)?:\\/\\/(?:(?:[!$&-.0-9;=?A-Z_a-z]|(?:\\%[a-fA-F0-9]{2}))+(?:\\:(?:[!$&-.0-9;=?A-Z_a-z]|(?:\\%[a-fA-F0-9]{2}))+)?\\@)?)?(?:(?:(?:[a-zA-Z0-9][-a-zA-Z0-9]*\\.)+(?:(?:aero|arpa|asia|a[cdefgilmnoqrstuwxz])|(?:biz|b[abdefghijmnorstvwyz])|(?:cat|com|coop|c[acdfghiklmnoruvxyz])|d[ejkmoz]|(?:edu|e[cegrstu])|f[ijkmor]|(?:gov|g[abdefghilmnpqrstuwy])|h[kmnrtu]|(?:info|int|i[delmnoqrst])|j[emop]|k[eghimnrwyz]|l[abcikrstuvy]|(?:mil|museum|m[acdeghklmnopqrstuvwxyz])|(?:name|net|n[acefgilopruz])|(?:org|om)|(?:pro|p[aefghklmnrstwy])|qa|r[eouw]|s[abcdeghijklmnortuvyz]|(?:tel|travel|t[cdfghjklmnoprtvwz])|u[agkmsyz]|v[aceginu]|w[fs]|y[etu]|z[amw]))|(?:(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[1-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])\\.(?:25[0-5]|2[0-4][0-9]|[0-1][0-9]{2}|[1-9][0-9]|[0-9])))(?:\\:\\d{1,5})?)(?:\\/(?:(?:[!#&-;=?-Z_a-z~])|(?:\\%[a-fA-F0-9]{2}))*)?)(?=\\b|\\s|$)', flags=re.IGNORECASE)
# ccTLD compressed regular expression clauses (re)created.
# .mobi .jobs deliberately excluded to discourage layer violations.
# see http://flic.kr/p/2kmuSL for more on the problematic new gTLDs
# part of $re derived from Android Open Source Project, Apache 2.0
# with a bunch of subsequent fixes/improvements (e.g. ttk.me/t44H2)
# thus auto_link_re is also Apache 2.0 licensed
# http://www.apache.org/licenses/LICENSE-2.0
# - Tantek 2010-046 (moved to auto_link_re 2012-062)
def auto_link(text, do_embed=False,maxUrlLength=0):
""" auto_link: param 1: text; param 2: do embeds or not
auto_link is idempotent, works on plain text or typical markup.
"""
regex = auto_link_re()
ms = regex.findall(text)
if not ms:
return text
mlen = len(ms)
sp = regex.split(text)
text = ''
for i in xrange(mlen):
mi = ms[i]
spliti = sp[i]
text = text + spliti
if sp[i + 1].startswith('/'): # regex omits end/ before </a
sp[i + 1] = sp[i + 1][1:]
mi = mi + '/' # include / in the match
spe = spliti[-2:]
# avoid 2x-linking, don't link CSS @-rules, attr values, asciibet
if ((not spe or not re.match('(?:\\=[\\"\\\']?|t;)', spe)) and
sp[i + 1].strip()[:3] != '</a' and
(mi not in ['charset', 'font', 'font-face', 'import', 'media',
'namespace', 'page', 'ABCDEFGHIJKLMNOPQ'])):
afterlink = ''
afterchar = mi[-1]
while (afterchar in '.!?,;"\')]}' and # trim punc from
(afterchar != ')' or '(' not in mi)): # 1 () pair
afterlink = afterchar + afterlink
mi = mi[:-1]
afterchar = mi[-1]
fe = 0
if do_embed:
_, fe = os.path.splitext(mi.lower())
wmi = web_address_to_uri(mi, True)
wp = urlparse(wmi)
prot = wp.scheme + ':'
hn = wp.netloc
pa = wp.path
ih = wmi.startswith('http')
displayUrl = mi
if maxUrlLength:
displayUrl = wp.netloc+wp.path
if wp.query:
displayUrl = displayUrl + '?'+wp.query
if len(displayUrl)> maxUrlLength:
shortUrl = unicode(displayUrl[:maxUrlLength])+u'…'
displayUrl = shortUrl
if (fe and
(fe == '.jpeg' or fe == '.jpg' or fe == '.png' or
fe == '.gif' or fe == '.svg')):
alt = 'a ' + 'photo' if 'photo' in mi else fe[1:]
text = (text + '<a class="auto-link figure" href="' +
wmi + '"><img alt="' + alt + '" src="' +
wmi + '"/></a>' + afterlink)
elif fe and (fe == '.mp4' or fe == '.mov' or fe == '.ogv'):
text = (text + '<a class="auto-link figure" href="' +
wmi, '"><video controls="controls" src="' +
wmi, '"></video></a>' + afterlink)
elif fe and (fe == '.mp3' or fe == '.m4a' or fe == '.ogg' or fe == '.wav'):
text = (text + '<a class="auto-link figure" href="' +
wmi, '"><audio controls="controls" src="' +
wmi, '"></audio></a>' + afterlink)
elif ih and hn == 'vimeo.com' and pa[1:].isdigit():
text = (text + '<a class="auto-link" href="' +
wmi + '">' + displayUrl + '</a> <iframe class="vimeo-player auto-link figure" width="480" height="385" style="border:0" src="' + prot + '//player.vimeo.com/video/' +
pa[1:] + '"></iframe>' + afterlink)
elif (hn == 'youtu.be' or
((hn == 'youtube.com' or hn == 'www.youtube.com')
and 'watch?v=' in mi)):
if hn == 'youtu.be':
yvid = pa[1:]
else:
offs = mi.index('watch?v=')
yvid = mi[offs + 8:].split('&', 1)[0]
text = (text + '<a class="auto-link" href="' +
wmi + '">' + displayUrl + '</a> <iframe class="youtube-player auto-link figure" width="480" height="385" style="border:0" src="' + prot + '//www.youtube.com/embed/' +
yvid + '"></iframe>' +
afterlink)
elif mi.startswith('@'):
if (sp[i + 1][:1] == '.' and
spliti != '' and ctype_email_local(spliti[-1])):
# if email address, simply append info, no linking
text = text + displayUrl + afterlink
else:
# treat it as a Twitter @-username reference and link it
text = (text + '<a class="auto-link h-x-username" href="' +
wmi + '">' + displayUrl + '</a>' +
afterlink)
elif wp.fragment:
fragmentioned = urllib.unquote_plus(wp.fragment)
if ' ' in fragmentioned and do_embed:
if fragmentioned.startswith('#'):
fragmentioned = fragmentioned[1:]
text = (text + '<blockquote class="auto-mention"><a class="auto-link" href="' +
wmi + '"><cite>' + wp.netloc +'</cite><p>' + fragmentioned
+ '</p></a></blockquote>' + afterlink)
else:
text = (text + '<a class="auto-link" href="' +
wmi + '">' + displayUrl + '</a>' +
afterlink)
else:
text = text + mi
return text + sp[mlen]
def web_address_to_uri(wa, addhttp=False):
if (not wa or wa.startswith('http://')
or wa.startswith('https://')
or wa.startswith('irc://')):
return wa
if (wa.startswith('Http://') or wa.startswith('Https://')):
# handle iOS4 overcapitalization of input entries
return 'h' + wa[1:]
# TBI: may want to handle typos as well like:
# missing/extra : or / http:/ http///
# missing letter in protocol: ttps htps htts, ttp htp htt, ir ic rc
# use strtolower(substr($wa, 0, 6)); // handle capitals in URLs
if wa[0] == '@':
return 'https://twitter.com/' + wa[1:]
if addhttp:
wa = 'http://' + wa
return wa
def ctype_email_local(s):
"""close enough. no '.' because this is used for last char of.
"""
return re.match("^[a-zA-Z0-9_%+-]+$", s)
if __name__ == '__main__':
text = auto_link("it's pretty interesting how twitter auto-links it... http://example.com/a_link_(with_parens) vs. (http://example.com/a_link_without)")
print(text)