Merge c9a9ccf8a3
into 48c5663c5f
This commit is contained in:
commit
1e03e36689
|
@ -3879,16 +3879,23 @@ def escape_url(url):
|
||||||
|
|
||||||
|
|
||||||
def read_batch_urls(batch_fd):
|
def read_batch_urls(batch_fd):
|
||||||
|
seen = set()
|
||||||
def fixup(url):
|
def fixup(url):
|
||||||
if not isinstance(url, compat_str):
|
if not isinstance(url, compat_str):
|
||||||
url = url.decode('utf-8', 'replace')
|
url = url.decode('utf-8', 'replace')
|
||||||
BOM_UTF8 = '\xef\xbb\xbf'
|
BOM_UTF8 = '\xef\xbb\xbf'
|
||||||
if url.startswith(BOM_UTF8):
|
if url.startswith(BOM_UTF8):
|
||||||
url = url[len(BOM_UTF8):]
|
url = url[len(BOM_UTF8):]
|
||||||
url = url.strip()
|
if url:
|
||||||
if url.startswith(('#', ';', ']')):
|
if url[0] == '\ufeff':
|
||||||
return False
|
url = url[1:]
|
||||||
|
url = url.lstrip()
|
||||||
|
if url and not url[0] in ('#', ';', ']'):
|
||||||
|
url = url.split('#', 1)[0].rstrip()
|
||||||
|
if not url in seen:
|
||||||
|
seen.add(url)
|
||||||
return url
|
return url
|
||||||
|
return False
|
||||||
|
|
||||||
with contextlib.closing(batch_fd) as fd:
|
with contextlib.closing(batch_fd) as fd:
|
||||||
return [url for url in map(fixup, fd) if url]
|
return [url for url in map(fixup, fd) if url]
|
||||||
|
|
Loading…
Reference in New Issue