Randomize archive order before populating search tree

This doesn't result in an elegant, perfectly balanced search tree,
but it's absolutely good enough. This commit completely mitigates
the worst-case scenario where the archive file is sorted.

Signed-off-by: Jody Bruchon <jody@jodybruchon.com>
This commit is contained in:
Jody Bruchon 2020-09-17 21:45:40 -04:00
parent 1d74d8d9f6
commit fda63a4e87
1 changed file with 12 additions and 35 deletions

View File

@ -122,17 +122,14 @@ class ArchiveTree(object):
# Tree insertion # Tree insertion
def at_insert(self, line): def at_insert(self, line):
# print("at_insert: ", line)
cur = self cur = self
while True: while True:
# print("comparing ", line, cur.line)
if cur.line: if cur.line:
if line < cur.line: if line < cur.line:
if cur.left is None: if cur.left is None:
cur.left = ArchiveTree(line) cur.left = ArchiveTree(line)
return return
else: else:
# print("LEFT")
cur = cur.left cur = cur.left
continue continue
elif line > cur.line: elif line > cur.line:
@ -140,7 +137,6 @@ class ArchiveTree(object):
cur.right = ArchiveTree(line) cur.right = ArchiveTree(line)
return return
else: else:
# print("RIGHT")
cur = cur.right cur = cur.right
continue continue
else: else:
@ -426,43 +422,24 @@ class YoutubeDL(object):
if ioe.errno != errno.ENOENT: if ioe.errno != errno.ENOENT:
raise raise
lmax = len(lines) lmax = len(lines)
if lmax >= 4: if lmax > 10:
# Populate binary search tree by splitting the archive list in half # Populate binary search tree by splitting the archive list in half
# and then adding from the outside edges inward # and then adding from the outside edges inward
# This mitigates the worst case where the archive has been sorted # This mitigates the worst case where the archive has been sorted
ptrLL = 0 pos = 0
ptrLR = lmax // 2 while pos < lmax:
ptrRL = ptrLR + 1 if lmax - pos <= 2:
ptrRR = lmax - 1
inserted = 0
while True:
# print("ptrs: %d %d %d %d" % (ptrLL, ptrLR, ptrRL, ptrRR))
if ptrLR > ptrLL:
self.archive.at_insert(lines[ptrLR])
inserted += 1
ptrLR -= 1;
if ptrRL < ptrRR:
self.archive.at_insert(lines[ptrRL])
inserted += 1
ptrRL += 1;
if ptrLL < ptrLR:
self.archive.at_insert(lines[ptrLL])
inserted += 1
ptrLL += 1;
if ptrRR > ptrRL:
self.archive.at_insert(lines[ptrRR])
inserted += 1
ptrRR -= 1;
if ptrLL == ptrLR and ptrRL == ptrRR:
print("inserted: %d, lmax: %d" % (inserted, lmax))
break break
elif lmax > 0: target = random.randrange(pos + 1, lmax - 1)
# Skip multi-line logic for a single line temp = lines[pos]
for idx in lines: lines[pos] = lines[target]
self.archive.at_insert(idx) lines[target] = lines[pos]
else: pos += 1
elif lmax < 1:
# No lines were loaded # No lines were loaded
return False return False
for x in lines:
self.archive.at_insert(x)
return True return True
def check_deprecated(param, option, suggestion): def check_deprecated(param, option, suggestion):