FIX: the file parser did not properly detect '35th' (or any number in that ordinal format), FIX: fixed some parser problems caused by having a '#' in the filename, FIX: made sure any text mentioning covers (e.g. '2 covers') is removed from the filename during parsing, FIX: a filename containing 'Vol. 2017' would not have its volume parsed properly

This commit is contained in:
evilhero 2019-04-05 12:39:59 -04:00
parent adf609145d
commit 3063b3adf1
1 changed file with 26 additions and 9 deletions

View File

@ -329,7 +329,8 @@ class FileChecker(object):
ret_sf2 = ' '.join(split_file3)
sf = re.findall('''\( [^\)]* \) |\[ [^\]]* \] |\S+''', ret_sf2, re.VERBOSE)
sf = re.findall('''\( [^\)]* \) |\[ [^\]]* \] |\[ [^\#]* \]|\S+''', ret_sf2, re.VERBOSE)
#sf = re.findall('''\( [^\)]* \) |\[ [^\]]* \] |\S+''', ret_sf2, re.VERBOSE)
ret_sf1 = ' '.join(sf)
@ -342,9 +343,8 @@ class FileChecker(object):
ret_sf1 = re.sub('\&', 'f11', ret_sf1).strip()
ret_sf1 = re.sub('\'', 'g11', ret_sf1).strip()
#split_file = re.findall('\([\w\s-]+\)|[-+]?\d*\.\d+|\d+|[\w-]+|#?\d\.\d+|#(?<![\w\d])XCV(?![\w\d])+|\)', ret_sf1, re.UNICODE)
split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+|[\w-]+|#?\d\.\d+|#(?<![\w\d])XCV(?![\w\d])+|\)', ret_sf1, re.UNICODE)
#split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+|[\w-]+|#?\d\.\d+|#(?<![\w\d])XCV(?![\w\d])+|\)', ret_sf1, re.UNICODE)
split_file = re.findall('(?imu)\([\w\s-]+\)|[-+]?\d*\.\d+|\d+[\s]COVERS+|\d{4}-\d{2}-\d{2}|\d+[(th|nd|rd|st)]+|\d+|[\w-]+|#?\d\.\d+|#[\.-]\w+|#[\d*\.\d+|\w+\d+]+|#(?<![\w\d])XCV(?![\w\d])+|#[\w+]|\)', ret_sf1, re.UNICODE)
#10-20-2018 ---START -- attempt to detect '01 (of 7.3)'
#10-20-2018 -- attempt to detect '36p ctc' as one element
spf = []
@ -507,6 +507,12 @@ class FileChecker(object):
logger.fdebug('Issue Number SHOULD BE: ' + str(lastissue_label))
validcountchk = True
match2 = re.search('(\d+[\s])covers', sf, re.IGNORECASE)
if match2:
num_covers = re.sub('[^0-9]', '', match2.group()).strip()
#logger.fdebug('%s covers detected within filename' % num_covers)
continue
if all([lastissue_position == (split_file.index(sf) -1), lastissue_label is not None, '#' not in sf, sf != 'p']):
#find it in the original file to see if there's a decimal between.
findst = lastissue_mod_position+1
@ -596,6 +602,16 @@ class FileChecker(object):
try:
volume_found['position'] = split_file.index(volumeprior_label, current_pos -1) #if this passes, then we're ok, otherwise will try exception
logger.fdebug('volume_found: ' + str(volume_found['position']))
#remove volume numeric from split_file
split_file.pop(volume_found['position'])
split_file.pop(split_file.index(sf, current_pos-1))
#join the previous label to the volume numeric
#volume = str(volumeprior_label) + str(volume)
#insert the combined info back
split_file.insert(volume_found['position'], volumeprior_label + volume)
split_file.insert(volume_found['position']+1, '')
#volume_found['position'] = split_file.index(sf, current_pos)
#logger.fdebug('NEWSPLITFILE: %s' % split_file)
except:
volumeprior = False
volumeprior_label = None
@ -608,7 +624,7 @@ class FileChecker(object):
logger.fdebug('volume label detected as : Volume ' + str(volume) + ' @ position: ' + str(split_file.index(sf)))
volumeprior = False
volumeprior_label = None
elif 'vol' in sf.lower() and len(sf) == 3:
elif all(['vol' in sf.lower(), len(sf) == 3]) or all(['vol.' in sf.lower(), len(sf) == 4]):
#if there's a space between the vol and # - adjust.
volumeprior = True
volumeprior_label = sf
@ -918,10 +934,11 @@ class FileChecker(object):
if split_file[issue_number_position -1].lower() == 'annual' or split_file[issue_number_position -1].lower() == 'special':
highest_series_pos = issue_number_position
else:
if volume_found['position'] < issue_number_position:
highest_series_pos = issue_number_position - 1
else:
highest_series_pos = issue_number_position
highest_series_pos = issue_number_position - 1
#if volume_found['position'] < issue_number_position:
# highest_series_pos = issue_number_position - 1
#else:
# highest_series_pos = issue_number_position
#make sure if we have multiple years detected, that the right one gets picked for the actual year vs. series title
if len(possible_years) > 1: