#/usr/bin/env python # This file is part of Mylar. # # Mylar is free software: you can redistribute it and/or modify # it under the terms of the GNU General Public License as published by # the Free Software Foundation, either version 3 of the License, or # (at your option) any later version. # # Mylar is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # GNU General Public License for more details. # # You should have received a copy of the GNU General Public License # along with Mylar. If not, see . import os import os.path import zlib import pprint import subprocess import re #import logger import mylar from mylar import logger, helpers import unicodedata import sys import platform def file2comicmatch(watchmatch): #print ("match: " + str(watchmatch)) pass def listFiles(dir, watchcomic, Publisher, AlternateSearch=None, manual=None, sarc=None): # use AlternateSearch to check for filenames that follow that naming pattern # ie. Star Trek TNG Doctor Who Assimilation won't get hits as the # checker looks for Star Trek TNG Doctor Who Assimilation2 (according to CV) # we need to convert to ascii, as watchcomic is utf-8 and special chars f'it up u_watchcomic = unicodedata.normalize('NFKD', watchcomic).encode('ASCII', 'ignore') #watchcomic.encode('ascii', 'ignore').strip() logger.fdebug('[FILECHECKER] comic: ' + u_watchcomic) basedir = dir logger.fdebug('[FILECHECKER] Looking in: ' + dir) watchmatch = {} comiclist = [] comiccnt = 0 not_these = ['#', ',', '\/', ':', '\;', '.', '-', '!', '\$', '\%', '\+', '\'', '\?', '\@'] issue_exceptions = ['AU', '.INH', '.NOW', 'AI', 'A', 'B', 'C', 'X', 'O'] extensions = ('.cbr', '.cbz', '.cb7') # #get the entire tree here dirlist = traverse_directories(basedir) # for item in os.listdir(basedir): for fname in dirlist: moddir = None # at a later point, we should store the basedir and scan it in for additional info, since some users # have their structure setup as 'Batman v2 (2011)/Batman #1.cbz' or 'Batman/V2-(2011)/Batman #1.cbz' if fname['directory'] == '': basedir = dir else: basedir = fname['directory'] #if it's a subdir, strip out the main dir and retain the remainder for the filechecker to find it. #start at position 1 so the initial slash is removed since it's a sub, and os.path.join will choke. moddir = basedir.replace(dir, '')[1:].rstrip() item = fname['filename'] #for mac OS metadata ignoring. if item.startswith('._'): logger.info('ignoring os metadata for ' + item) continue if item == 'cover.jpg' or item == 'cvinfo': continue if not item.lower().endswith(extensions): #logger.fdebug('[FILECHECKER] filename not a valid cbr/cbz - ignoring: ' + item) continue #print item #subname = os.path.join(basedir, item) subname = item subname = re.sub('\_', ' ', subname) #Remove html code for ( ) subname = re.sub(r'%28', '(', subname) subname = re.sub(r'%29', ')', subname) #versioning - remove it subsplit = subname.replace('_', ' ').split() volrem = None vers4year = "no" vers4vol = "no" digitchk = 0 if sarc and mylar.READ2FILENAME: logger.fdebug('[SARC] subname: ' + subname) removest = subname.find('-') # the - gets removed above so we test for the first blank space... logger.fdebug('[SARC] Checking filename for Reading Order sequence - removest: ' + str(removest)) logger.fdebug('removestdig: ' + subname[:removest -1]) if subname[:removest].isdigit() and removest == 3: subname = subname[4:] logger.fdebug('[SARC] Removed Reading Order sequence from subname. Now set to : ' + subname) for subit in subsplit: if subit[0].lower() == 'v': vfull = 0 if subit[1:].isdigit(): #if in format v1, v2009 etc... if len(subit[1:]) == 4: #v2013 # if it's greater than 3 in length, then the format is Vyyyy logger.fdebug('[FILECHECKER] Version detected as : ' + str(subit)) vers4year = "yes" else: if len(subit) < 4: logger.fdebug('[FILECHECKER] Version detected as : ' + str(subit)) vers4vol = str(subit) subname = re.sub(subit, '', subname) volrem = subit vers4vol = volrem break elif subit.lower()[:3] == 'vol': tsubit = re.sub('vol', '', subit.lower()) try: if any([tsubit.isdigit(), len(tsubit) > 5]): #if in format vol.2013 etc #because the '.' in Vol. gets removed, let's loop thru again after the Vol hit to remove it entirely logger.fdebug('[FILECHECKER] volume indicator detected as version #:' + str(subit)) subname = re.sub(subit, '', subname) volrem = subit vers4year = "yes" except: continue #check if a year is present in series title (ie. spider-man 2099) #also check if decimal present in series title (ie. batman beyond 2.0) #- check if brackets present in series title numberinseries = 'False' decimalinseries = 'False' bracketsinseries = 'False' for i in watchcomic.split(): if i.isdigit(): numberinseries = 'True' if ('20' in i or '19' in i): if i.isdigit(): numberinseries = 'True' else: find20 = i.find('20') if find20: stf = i[find20:4].strip() find19 = i.find('19') if find19: stf = i[find19:4].strip() logger.fdebug('[FILECHECKER] stf is : ' + str(stf)) if stf.isdigit(): numberinseries = 'True' if ('.' in i): try: float(i) decimalinseries = 'True' std = i logger.fdebug('[FILECHECKER] std is : ' + str(std)) except: pass #logger.fdebug('[FILECHECKER] i : ' + str(i)) if ('(' in i): bracketsinseries = 'True' bracket_length_st = watchcomic.find('(') bracket_length_en = watchcomic.find(')', bracket_length_st) bracket_length = bracket_length_en - bracket_length_st bracket_word = watchcomic[bracket_length_st:bracket_length_en +1] logger.fdebug('[FILECHECKER] bracketinseries: ' + str(bracket_word)) logger.fdebug('[FILECHECKER] numberinseries: ' + str(numberinseries)) logger.fdebug('[FILECHECKER] decimalinseries: ' + str(decimalinseries)) logger.fdebug('[FILECHECKER] bracketinseries: ' + str(bracketsinseries)) #iniitate the alternate list here so we can add in the different flavours based on above AS_Alt = [] #remove the brackets.. if bracketsinseries == 'True': logger.fdebug('[FILECHECKER] modifying subname to accomodate brackets within series title.') #subnm_mod2 = re.findall('[^()]+', subname[bracket_length_en:]) #logger.fdebug('[FILECHECKER] subnm_mod : ' + str(subnm_mod2)) #subnm_mod = re.sub('[\(\)]',' ', subname[:bracket_length_st]) + str(subname[bracket_length_en:]) #logger.fdebug('[FILECHECKER] subnm_mod_st: ' + str(subname[:bracket_length_st])) #logger.fdebug('[FILECHECKER] subnm_mod_en: ' + str(subname[bracket_length_en:])) #logger.fdebug('[FILECHECKER] modified subname is now : ' + str(subnm_mod)) if bracket_word in subname: nobrackets_word = re.sub('[\(\)]', '', bracket_word).strip() subname = re.sub(nobrackets_word, '', subname).strip() subnm = re.findall('[^()]+', subname) logger.fdebug('[FILECHECKER] subnm len : ' + str(len(subnm))) if len(subnm) == 1: logger.fdebug('[FILECHECKER] ' + str(len(subnm)) + ': detected invalid filename - attempting to detect year to continue') #if the series has digits this f's it up. if numberinseries == 'True' or decimalinseries == 'True': #we need to remove the series from the subname and then search the remainder. watchname = re.sub('[\:\;\!\'\/\?\+\=\_\%\.\-]', '', watchcomic) #remove spec chars for watchcomic match. logger.fdebug('[FILECHECKER] watch-cleaned: ' + watchname) subthis = re.sub('.cbr', '', subname) subthis = re.sub('.cbz', '', subthis) subthis = re.sub('[\:\;\!\'\/\?\+\=\_\%\.\-]', '', subthis) subthis = re.sub('\s+', ' ', subthis) logger.fdebug('[FILECHECKER] sub-cleaned: ' + subthis) #we need to make sure the file is part of the correct series or else will match falsely if watchname.lower() not in subthis.lower(): logger.fdebug('[FILECHECKER] ' + watchname + ' this is a false match to ' + subthis + ' - Ignoring this result.') continue ogsubthis = subthis subthis = subthis[len(watchname):] #remove watchcomic #we need to now check the remainder of the string for digits assuming it's a possible year logger.fdebug('[FILECHECKER] new subname: ' + subthis) if subthis.startswith('('): # if it startswith a bracket, then it's probably a year - let's check. for i in subthis.split(): tmpi = re.sub('[\(\)]', '', i).strip() if tmpi.isdigit(): if (tmpi.startswith('19') or tmpi.startswith('20')) and len(tmpi) == 4: logger.fdebug('[FILECHECKER] year detected: ' + str(tmpi)) subname = re.sub('(19\d{2}|20\d{2})(.*)', '\\2 (\\1)', subthis) subname = re.sub('\(\)', '', subname).strip() subname = watchcomic + ' ' + subname logger.fdebug('[FILECHECKER] new subname reversed: ' + subname) break else: year = None for i in subthis.split(): if ('20' in i or '19' in i): if i.isdigit(): year = i[:4] else: findyr20 = i.find('20') if findyr20: styear = i[findyr20:4].strip() findyr19 = i.find('19') if findyr19: styear = i[findyr19:4].strip() if styear.isdigit() and len(styear) == 4: year = styear logger.fdebug('[FILECHECKER] stf is : ' + str(styear)) if year: subname = re.sub('(.*)[\s+|_+](19\d{2}|20\d{2})(.*)', '\\1 \\2 (\\3)', subthis) else: #unable to find year in filename logger.fdebug('[FILECHECKER] Unable to detect year within filename. Continuing as is and assuming this is a volume 1 and will work itself out later.') subname = ogsubthis subnm = re.findall('[^()]+', subname) else: subit = re.sub('(.*)[\s+|_+](19\d{2}|20\d{2})(.*)', '\\1 \\3 (\\2)', subname).replace('( )', '') subthis2 = re.sub('.cbr', '', subit) subthis1 = re.sub('.cbz', '', subthis2) subname = re.sub('[\:\;\!\'\/\?\+\=\_\%]', '', subthis1) #if '.' appears more than once at this point, then it's being used in place of spaces. #if '.' only appears once at this point, it's a decimal issue (since decimalinseries is False within this else stmt). if len(str(subname.count('.'))) == 1: logger.fdebug('[FILECHECKER] decimal issue detected, not removing decimals') else: logger.fdebug('[FILECHECKER] more than one decimal detected, and the series does not have decimals - assuming in place of spaces.') subname = re.sub('[\.]', '', subname) subnm = re.findall('[^()]+', subname) else: if numberinseries == 'True' or decimalinseries == 'True': #we need to remove the series from the subname and then search the remainder. subthis = re.sub('.cbr', '', subname) subthis = re.sub('.cbz', '', subthis) if decimalinseries == 'True': watchname = re.sub('[\:\;\!\'\/\?\+\=\_\%\-]', '', watchcomic) #remove spec chars for watchcomic match. subthis = re.sub('[\:\;\!\'\/\?\+\=\_\%\-]', '', subthis) else: # in order to get series like Earth 2 scanned in that contain a decimal, I removed the \. from the re.subs below - 28-08-2014 watchname = re.sub('[\:\;\!\'\/\?\+\=\_\%\-]', '', watchcomic) #remove spec chars for watchcomic match. subthis = re.sub('[\:\;\!\'\/\?\+\=\_\%\-]', '', subthis) logger.fdebug('[FILECHECKER] watch-cleaned: ' + watchname) subthis = re.sub('\s+', ' ', subthis) logger.fdebug('[FILECHECKER] sub-cleaned: ' + subthis) #we need to make sure the file is part of the correct series or else will match falsely if watchname.lower() not in subthis.lower(): logger.fdebug('[FILECHECKER] ' + watchname + ' this is a false match to ' + subthis + ' - Ignoring this result.') continue subthis = subthis[len(watchname):].strip() #remove watchcomic #we need to now check the remainder of the string for digits assuming it's a possible year logger.fdebug('[FILECHECKER] new subname: ' + subthis) if subthis.startswith('('): # if it startswith a bracket, then it's probably a year and the format is incorrect to continue - let's check. for i in subthis.split(): tmpi = re.sub('[\(\)]', '', i).strip() if tmpi.isdigit(): if (tmpi.startswith('19') or tmpi.startswith('20')) and len(tmpi) == 4: logger.fdebug('[FILECHECKER] Year detected: ' + str(tmpi)) subname = re.sub('(19\d{2}|20\d{2})(.*)', '\\2 (\\1)', subthis) subname = re.sub('\(\)', '', subname).strip() logger.fdebug('[FILECHECKER] Flipping the issue with the year: ' + subname) break else: numcheck = re.findall('[19\d{2}|20\d{2}]', subthis) if len(numcheck) == 1: subname = re.sub('(19\d{2}|20\d{2})(.*)', '\\2 (\\1)', subthis) else: subname = re.sub('(19\d{2}|20\d{2})(.*)', '\\1 (\\2)', subthis) subname = re.sub('\(\)', '', subname).strip() subname = watchname + ' ' + subname subname = re.sub('\s+', ' ', subname).strip() logger.fdebug('[FILECHECKER] New subname reversed: ' + subname) subnm = re.findall('[^()]+', subname) subsplit = subname.replace('_', ' ').split() if sarc is None: if Publisher.lower() in re.sub('_', ' ', subname.lower()): #if the Publisher is given within the title or filename even (for some reason, some people #have this to distinguish different titles), let's remove it entirely. lenm = len(subnm) cnt = 0 pub_removed = None while (cnt < lenm): submod = re.sub('_', ' ', subnm[cnt]) if submod is None: break if submod == ' ': pass else: logger.fdebug('[FILECHECKER] ' + str(cnt) + ". Bracket Word: " + submod) if Publisher.lower() in submod.lower() and cnt >= 1: logger.fdebug('[FILECHECKER] Publisher detected within title : ' + submod) logger.fdebug('[FILECHECKER] cnt is : ' + str(cnt) + ' --- Publisher is: ' + Publisher) #-strip publisher if exists here- pub_removed = submod logger.fdebug('[FILECHECKER] removing publisher from title') subname_pubremoved = re.sub(pub_removed, '', subname) logger.fdebug('[FILECHECKER] pubremoved : ' + subname_pubremoved) subname_pubremoved = re.sub('\(\)', '', subname_pubremoved) #remove empty brackets subname_pubremoved = re.sub('\s+', ' ', subname_pubremoved) #remove spaces > 1 logger.fdebug('[FILECHECKER] blank brackets removed: ' + subname_pubremoved) subnm = re.findall('[^()]+', subname_pubremoved) break cnt+=1 #If the Year comes before the Issue # the subname is passed with no Issue number. #This logic checks for numbers before the extension in the format of 1 01 001 #and adds to the subname. (Cases where comic name is $Series_$Year_$Issue) # if len(subnm) > 1: # if (re.search('(19\d{2}|20\d{2})',subnm[1]) is not None): # logger.info('subnm[1]: ' + str(subnm[1])) # for i in subnm: # tmpi = i.strip() # if tmpi.isdigit(): # if (tmpi.startswith('19') or tmpi.startswith('20')) and len(tmpi) == 4: # logger.info('[FILECHECKER] year detected: ' + str(tmpi)) # #strip out all the brackets in the subnm[2] if it exists so we're left with just the issue # in most cases # subremoved = re.findall('[^()]+', subnm[2]).strip() # if len(subremoved) > 5: # logger.info('[FILECHECKER] something is wrong with the parsing - better report the issue on github.') # break # subname = re.sub('(.*)[\s+|_+](19\d{2}|20\d{2})(.*)', '\\1 ' + str(subremoved) + ' (\\2)', subname) # subname = re.sub('\(\)', '', subname).strip() # logger.info('[FILECHECKER] THE new subname reversed: ' + str(subname)) # break # else: # subname = re.sub('(.*)[\s+|_+](19\d{2}|20\d{2})(.*)', '\\1 \\2 (\\3)', subname) # subnm = re.findall('[^()]+', subname) # we need to regenerate this here. # logger.fdebug('[FILECHECKER] subnm0: ' + str(subnm[0])) # logger.fdebug('[FILECHECKER] subnm1: ' + str(subnm[1])) # logger.fdebug('subnm2: ' + str(subnm[2])) # subname = str(subnm[0]).lstrip() + ' (' + str(subnm[1]).strip() + ') ' subname = subnm[0] if len(subnm) == 1: # if it still has no year (brackets), check setting and either assume no year needed. subname = subname logger.fdebug('[FILECHECKER] subname no brackets: ' + subname) nonocount = 0 charpos = 0 detneg = "no" leavehyphen = False should_restart = True lenwatch = len(watchcomic) # because subname gets replaced dynamically, the length will change and things go wrong. while should_restart: should_restart = False for nono in not_these: if nono in subname: subcnt = subname.count(nono) charpos = indices(subname, nono) # will return a list of char positions in subname logger.fdebug("[" + str(nono) + "] charpos: " + str(charpos)) if nono == '-': i=0 while (i < len(charpos)): for i, j in enumerate(charpos): if j +2 > len(subname): sublimit = subname[j +1:] else: sublimit = subname[j +1:j +2] if sublimit.isdigit(): logger.fdebug('[FILECHECKER] possible negative issue detected.') nonocount = nonocount + subcnt - 1 detneg = "yes" elif '-' in watchcomic and j < lenwatch: lenwatch -=1 logger.fdebug('[FILECHECKER] - appears in series title.') logger.fdebug('[FILECHECKER] up to - :' + subname[:j +1].replace('-', ' ')) logger.fdebug('[FILECHECKER] after - :' + subname[j +1:]) subname = subname[:j +1].replace('-', '') + subname[j +1:] logger.fdebug('[FILECHECKER] new subname is : ' + subname) should_restart = True leavehyphen = True i+=1 if detneg == "no" and leavehyphen == False: subname = re.sub(str(nono), ' ', subname) nonocount = nonocount + subcnt #logger.fdebug('[FILECHECKER] (str(nono) + " detected " + str(subcnt) + " times.") # segment '.' having a . by itself will denote the entire string which we don't want elif nono == '.': logger.fdebug('[FILECHECKER] Decimal check.') x = 0 fndit = 0 dcspace = 0 while (x < len(charpos)): for x, j in enumerate(charpos): fndit = j logger.fdebug('fndit: ' + str(fndit)) logger.fdebug('isdigit1: ' + subname[fndit -1:fndit]) logger.fdebug('isdigit2: ' + subname[fndit +1:fndit +2]) if subname[fndit -1:fndit].isdigit() and subname[fndit +1:fndit +2].isdigit(): logger.fdebug('[FILECHECKER] decimal issue detected.') dcspace+=1 else: subname = subname[:fndit] + ' ' + subname[fndit +1:] nonocount+=1 x+=1 nonocount += (subcnt + dcspace) #if dcspace == 1: # nonocount = nonocount + subcnt + dcspace #else: # subname = re.sub('\.', ' ', subname) # nonocount = nonocount + subcnt - 1 #(remove the extension from the length) else: #this is new - if it's a symbol seperated by a space on each side it drags in an extra char. x = 0 fndit = 0 blspc = 0 if nono == '#': fndit = subname.find(nono) if subname[fndit +1].isdigit(): subname = re.sub('#', '', subname) continue while x < subcnt: fndit = subname.find(nono, fndit) #print ("space before check: " + str(subname[fndit-1:fndit])) #print ("space after check: " + str(subname[fndit+1:fndit+2])) if subname[fndit -1:fndit] == ' ' and subname[fndit +1:fndit +2] == ' ': logger.fdebug('[FILECHECKER] blankspace detected before and after ' + str(nono)) blspc+=1 x+=1 logger.fdebug('[FILECHECKER] replacing ' + str(nono) + ' with a space') subname = re.sub(str(nono), '', subname) nonocount = nonocount + subcnt + blspc #subname = re.sub('[\_\#\,\/\:\;\.\-\!\$\%\+\'\?\@]',' ', subname) if decimalinseries == 'True': modwatchcomic = re.sub('[\_\#\,\/\:\;\!\$\%\?\@]', ' ', u_watchcomic) else: modwatchcomic = re.sub('[\_\#\,\/\:\;\.\!\$\%\?\@]', ' ', u_watchcomic) if bracketsinseries == 'True': modwatchcomic = re.sub('[\(\)]', ' ', modwatchcomic) modwatchcomic = re.sub('[\-\']', '', modwatchcomic) #trying this too - 2014-03-01 #if leavehyphen == False: # logger.fdebug('[FILECHECKER] ('removing hyphen for comparisons') # modwatchcomic = re.sub('-', ' ', modwatchcomic) # subname = re.sub('-', ' ', subname) detectand = False detectthe_mod = False detectthe_sub = False modwatchcomic = re.sub('\&', ' and ', modwatchcomic) if ' the ' in modwatchcomic.lower() or modwatchcomic.lower().startswith('the '): modwatchcomic = re.sub("\\bthe\\b", "", modwatchcomic.lower()) logger.fdebug('[FILECHECKER] new modwatchcomic: ' + modwatchcomic) detectthe_mod = True modwatchcomic = re.sub('\s+', ' ', modwatchcomic).strip() if '&' in subname: logger.fdebug('[FILECHECKER] detected & in subname') subname = re.sub('\&', ' and ', subname) detectand = True if ' the ' in subname.lower() or subname.lower().startswith('the '): subname = re.sub("\\bthe\\b", "", subname.lower()) detectthe_sub = True subname = re.sub('\s+', ' ', subname).strip() #AS_Alt = [] AS_Tuple = [] if AlternateSearch is not None: chkthealt = AlternateSearch.split('##') if chkthealt == 0: AS_Alternate = AlternateSearch for calt in chkthealt: AS_tupled = False AS_Alternate = re.sub('##', '', calt) if '!!' in AS_Alternate: # if it's !! present, it's the comicid associated with the series as an added annual. # extract the !!, store it and then remove it so things will continue. as_start = AS_Alternate.find('!!') logger.fdebug('as_start: ' + str(as_start) + ' --- ' + str(AS_Alternate[as_start:])) as_end = AS_Alternate.find('##', as_start) if as_end == -1: as_end = len(AS_Alternate) logger.fdebug('as_start: ' + str(as_end) + ' --- ' + str(AS_Alternate[as_start:as_end])) AS_ComicID = AS_Alternate[as_start +2:as_end] logger.fdebug('[FILECHECKER] Extracted comicid for given annual : ' + str(AS_ComicID)) AS_Alternate = re.sub('!!' + str(AS_ComicID), '', AS_Alternate) AS_tupled = True #same = encode. u_altsearchcomic = AS_Alternate.encode('ascii', 'ignore').strip() altsearchcomic = re.sub('[\_\#\,\/\:\;\.\!\$\%\+\?\@]', ' ', u_altsearchcomic) altsearchcomic = re.sub('[\-\']', '', altsearchcomic) #because this is a watchcomic registered, use same algorithim for watchcomic altsearchcomic = re.sub('\&', ' and ', altsearchcomic) if detectthe_sub == True: altsearchcomic = re.sub("\\bthe\\b", "", altsearchcomic.lower()) altsearchcomic = re.sub('\s+', ' ', str(altsearchcomic)).strip() if AS_tupled: AS_Tuple.append({"ComicID": AS_ComicID, "AS_Alternate": altsearchcomic}) AS_Alt.append(altsearchcomic) else: #create random characters so it will never match. altsearchcomic = "127372873872871091383 abdkhjhskjhkjdhakajhf" AS_Alt.append(altsearchcomic) #if '_' in subname: # subname = subname.replace('_', ' ') logger.fdebug('[FILECHECKER] AS_Alt : ' + str(AS_Alt)) logger.fdebug('[FILECHECKER] watchcomic:' + modwatchcomic + ' ..comparing to found file: ' + subname) if modwatchcomic.lower() in subname.lower() or any(x.lower() in subname.lower() for x in AS_Alt): #if the alternate search name is almost identical, it won't match up because it will hit the 'normal' first. #not important for series' matches, but for annuals, etc it is very important. #loop through the Alternates picking out the ones that match and then do an overall loop. enable_annual = False loopchk = [x for x in AS_Alt if x.lower() in subname.lower()] if len(loopchk) > 0 and loopchk[0] != '': logger.fdebug('[FILECHECKER] This should be an alternate: ' + str(loopchk)) if 'annual' in subname.lower(): logger.fdebug('[FILECHECKER] Annual detected - proceeding') enable_annual = True else: loopchk = [] if modwatchcomic.lower() in subname.lower() and enable_annual == False: loopchk.append(modwatchcomic) if 'annual' in subname.lower(): if 'bi annual' in subname.lower(): logger.fdebug('[FILECHECKER] BiAnnual detected - wouldn\'t Deadpool be proud?') subname = re.sub('Bi Annual', 'BiAnnual', subname) jtd_len = subname.lower().find('bi annual') enable_annual = True else: logger.fdebug('[FILECHECKER] Annual detected - proceeding cautiously.') jtd_len = subname.lower().find('annual') enable_annual = False logger.fdebug('[FILECHECKER] Complete matching list of names to this file [' + str(len(loopchk)) + '] : ' + str(loopchk)) for loopit in loopchk: modwatchcomic = loopit logger.fdebug('[FILECHECKER] AS_Tuple : ' + str(AS_Tuple)) annual_comicid = None for ATS in AS_Tuple: logger.fdebug('[FILECHECKER] ' + str(ATS['AS_Alternate']) + ' comparing to ' + subname[:len(ATS['AS_Alternate'])]) #str(modwatchcomic)) if ATS['AS_Alternate'].lower().strip() == subname[:len(ATS['AS_Alternate'])].lower().strip(): #modwatchcomic logger.fdebug('[FILECHECKER] Associating ComiciD : ' + str(ATS['ComicID'])) annual_comicid = str(ATS['ComicID']) modwatchcomic = ATS['AS_Alternate'] break comicpath = os.path.join(basedir, item) logger.fdebug('[FILECHECKER] ' + modwatchcomic + ' - watchlist match on : ' + comicpath) comicsize = os.path.getsize(comicpath) #print ("Comicsize:" + str(comicsize)) comiccnt+=1 stann = 0 cchk = modwatchcomic #else: #if modwatchcomic.lower() in subname.lower(): # cchk = modwatchcomic #else: # cchk_ls = [x for x in AS_Alt if x.lower() in subname.lower()] # cchk = cchk_ls[0] logger.fdebug('[FILECHECKER] cchk is : ' + str(cchk)) logger.fdebug('[FILECHECKER] we should remove ' + str(nonocount) + ' characters') findtitlepos = subname.find('-') if charpos != 0: logger.fdebug('[FILECHECKER] detected ' + str(len(charpos)) + ' special characters') for i, j in enumerate(charpos): logger.fdebug('i,j:' + str(i) + ',' + str(j)) logger.fdebug(str(len(subname)) + ' - subname: ' + subname) logger.fdebug("digitchk: " + subname[j -1:]) if j >= len(subname): logger.fdebug('[FILECHECKER] ' + str(j) + ' is >= ' + str(len(subname)) + ' .End reached. ignoring remainder.') break elif subname[j:] == '-': try: if j <= len(subname) and subname[j +1].isdigit(): logger.fdebug('[FILECHECKER] negative issue detected.') #detneg = "yes" except IndexError: logger.fdebug('[FILECHECKER] There was a problem parsing the information from this filename: ' + comicpath) elif j > findtitlepos: if subname[j:] == '#': if subname[j +1].isdigit(): logger.fdebug('[FILECHECKER] # detected denoting issue#, ignoring.') else: nonocount-=1 elif ('-' in watchcomic or '.' in watchcomic) and j < len(watchcomic): logger.fdebug('[FILECHECKER] - appears in series title, ignoring.') else: digitchk = re.sub('#', '', subname[j -1:]).strip() logger.fdebug('[FILECHECKER] special character appears outside of title - ignoring @ position: ' + str(charpos[i])) nonocount-=1 #remove versioning here if volrem != None: jtd_len = len(cchk)# + len(volrem)# + nonocount + 1 #1 is to account for space btwn comic and vol # else: jtd_len = len(cchk)# + nonocount # if sarc and mylar.READ2FILENAME: # removest = subname.find(' ') # the - gets removed above so we test for the first blank space... # if subname[:removest].isdigit(): # jtd_len += removest + 1 # +1 to account for space in place of - # logger.fdebug('[FILECHECKER] adjusted jtd_len to : ' + str(removest) + ' because of story-arc reading order tags') logger.fdebug('[FILECHECKER] nonocount [' + str(nonocount) + '] cchk [' + cchk + '] length [' + str(len(cchk)) + ']') #if detectand: # jtd_len = jtd_len - 2 # char substitution diff between & and 'and' = 2 chars #if detectthe_mod == True and detectthe_sub == False: #jtd_len = jtd_len - 3 # char subsitiution diff between 'the' and '' = 3 chars #justthedigits = item[jtd_len:] logger.fdebug('[FILECHECKER] final jtd_len to prune [' + str(jtd_len) + ']') logger.fdebug('[FILECHECKER] before title removed from FILENAME [' + str(item) + ']') logger.fdebug('[FILECHECKER] after title removed from FILENAME [' + str(item[jtd_len:]) + ']') logger.fdebug('[FILECHECKER] creating just the digits using SUBNAME, pruning first [' + str(jtd_len) + '] chars from [' + subname + ']') justthedigits_1 = re.sub('#', '', subname[jtd_len:]).strip() if enable_annual: logger.fdebug('enable annual is on') if annual_comicid is not None: logger.fdebug('annual comicid is ' + str(annual_comicid)) if 'biannual' in modwatchcomic.lower(): logger.fdebug('bi annual detected') justthedigits_1 = 'BiAnnual ' + justthedigits_1 else: logger.fdebug('annual detected') justthedigits_1 = 'Annual ' + justthedigits_1 logger.fdebug('[FILECHECKER] after title removed from SUBNAME [' + justthedigits_1 + ']') titlechk = False if digitchk: try: #do the issue title check here logger.fdebug('[FILECHECKER] Possible issue title is : ' + str(digitchk)) # see if it can float the digits try: st = digitchk.find('.') logger.fdebug('st:' + str(st)) st_d = digitchk[:st] logger.fdebug('st_d:' + str(st_d)) st_e = digitchk[st +1:] logger.fdebug('st_e:' + str(st_e)) #x = int(float(st_d)) #logger.fdebug('x:' + str(x)) #validity check if helpers.is_number(st_d): #x2 = int(float(st_e)) if helpers.is_number(st_e): logger.fdebug('[FILECHECKER] This is a decimal issue.') else: raise ValueError else: raise ValueError except ValueError, e: if digitchk.startswith('.'): pass else: if len(justthedigits_1) >= len(digitchk) and len(digitchk) > 3: logger.fdebug('[FILECHECKER] Removing issue title.') justthedigits_1 = re.sub(digitchk, '', justthedigits_1).strip() logger.fdebug('[FILECHECKER] After issue title removed [' + justthedigits_1 + ']') titlechk = True hyphensplit = digitchk issue_firstword = digitchk.split()[0] splitit = subname.split() splitst = len(splitit) logger.fdebug('[FILECHECKER] splitit :' + str(splitit)) logger.fdebug('[FILECHECKER] splitst :' + str(len(splitit))) orignzb = item except: #test this out for manual post-processing items like original sin 003.3 - thor and loki 002... #*************************************************************************************** # need to assign digitchk here for issues that don't have a title and fail the above try. #*************************************************************************************** try: logger.fdebug('[FILECHECKER] justthedigits_1 len : ' + str(len(justthedigits_1))) logger.fdebug('[FILECHECKER] digitchk len : ' + str(len(digitchk))) if len(justthedigits_1) >= len(digitchk) and len(digitchk) > 3: logger.fdebug('[FILECHECKER] Removing issue title.') justthedigits_1 = re.sub(digitchk, '', justthedigits_1).strip() logger.fdebug('[FILECHECKER] After issue title removed [' + justthedigits_1 + ']') titlechk = True hyphensplit = digitchk issue_firstword = digitchk.split()[0] splitit = subname.split() splitst = len(splitit) logger.info('[FILECHECKER] splitit :' + str(splitit)) logger.info('[FILECHECKER] splitst :' + str(len(splitit))) orignzb = item except: pass #(revert this back if above except doesn't work) #remove the title if it appears #findtitle = justthedigits.find('-') #if findtitle > 0 and detneg == "no": # justthedigits = justthedigits[:findtitle] # logger.fdebug('[FILECHECKER] ("removed title from name - is now : " + str(justthedigits)) justthedigits = justthedigits_1.split(' ', 1)[0] digitsvalid = "false" if not justthedigits.isdigit() and 'annual' not in justthedigits.lower(): logger.fdebug('[FILECHECKER] Invalid character found in filename after item removal - cannot find issue # with this present. Temporarily removing it from the comparison to be able to proceed.') try: justthedigits = justthedigits_1.split(' ', 1)[1] if justthedigits.isdigit(): digitsvalid = "true" except: pass if digitsvalid == "false": if 'annual' not in justthedigits.lower(): for jdc in list(justthedigits): if not jdc.isdigit(): jdc_start = justthedigits.find(jdc) alpha_isschk = justthedigits[jdc_start:] for issexcept in issue_exceptions: if issexcept.lower() in alpha_isschk.lower() and len(alpha_isschk) <= len(issexcept): logger.fdebug('[FILECHECKER] ALPHANUMERIC EXCEPTION : [' + justthedigits + ']') digitsvalid = "true" break if digitsvalid == "true": break try: tmpthedigits = justthedigits_1.split(' ', 1)[1] logger.fdebug('[FILECHECKER] If the series has a decimal, this should be a number [' + tmpthedigits + ']') if 'cbr' in tmpthedigits.lower() or 'cbz' in tmpthedigits.lower(): tmpthedigits = tmpthedigits[:-3].strip() logger.fdebug('[FILECHECKER] Removed extension - now we should just have a number [' + tmpthedigits + ']') poss_alpha = tmpthedigits if poss_alpha.isdigit(): digitsvalid = "true" if (justthedigits.lower() == 'annual' and 'annual' not in watchcomic.lower()) or (annual_comicid is not None): logger.fdebug('[FILECHECKER] ANNUAL DETECTED [' + poss_alpha + ']') justthedigits += ' ' + poss_alpha else: justthedigits += '.' + poss_alpha logger.fdebug('[FILECHECKER] DECIMAL ISSUE DETECTED [' + justthedigits + ']') else: for issexcept in issue_exceptions: decimalexcept = False if '.' in issexcept: decimalexcept = True issexcept = issexcept[1:] #remove the '.' from comparison... if issexcept.lower() in poss_alpha.lower() and len(poss_alpha) <= len(issexcept): if decimalexcept: issexcept = '.' + issexcept justthedigits += issexcept #poss_alpha logger.fdebug('[FILECHECKER] ALPHANUMERIC EXCEPTION. COMBINING : [' + justthedigits + ']') digitsvalid = "true" break except: tmpthedigits = None # justthedigits = justthedigits.split(' ', 1)[0] #if the issue has an alphanumeric (issue_exceptions, join it and push it through) logger.fdebug('[FILECHECKER] JUSTTHEDIGITS [' + justthedigits + ']') if digitsvalid == "true": pass else: if justthedigits.isdigit(): digitsvalid = "true" else: if '.' in justthedigits: tmpdec = justthedigits.find('.') b4dec = justthedigits[:tmpdec] a4dec = justthedigits[tmpdec +1:] if a4dec.isdigit() and b4dec.isdigit(): logger.fdebug('[FILECHECKER] DECIMAL ISSUE DETECTED') digitsvalid = "true" else: try: x = float(justthedigits) #validity check if x < 0: logger.fdebug("I've encountered a negative issue #: " + str(justthedigits) + ". Trying to accomodate.") digitsvalid = "true" else: raise ValueError except ValueError, e: logger.fdebug('Probably due to an incorrect match - I cannot determine the issue number from given issue #: ' + str(justthedigits)) logger.fdebug('[FILECHECKER] final justthedigits [' + justthedigits + ']') if digitsvalid == "false": logger.fdebug('[FILECHECKER] Issue number not properly detected...ignoring.') comiccnt -=1 # remove the entry from the list count as it was incorrrectly tallied. continue if manual is not None: #this is needed for Manual Run to determine matches #without this Batman will match on Batman Incorporated, and Batman and Robin, etc.. # in case it matches on an Alternate Search pattern, set modwatchcomic to the cchk value modwatchcomic = cchk logger.fdebug('[FILECHECKER] cchk = ' + cchk.lower()) logger.fdebug('[FILECHECKER] modwatchcomic = ' + modwatchcomic.lower()) logger.fdebug('[FILECHECKER] subname = ' + subname.lower()) comyear = manual['SeriesYear'] issuetotal = manual['Total'] comicvolume = manual['ComicVersion'] logger.fdebug('[FILECHECKER] SeriesYear: ' + str(comyear)) logger.fdebug('[FILECHECKER] IssueTotal: ' + str(issuetotal)) logger.fdebug('[FILECHECKER] Comic Volume: ' + str(comicvolume)) logger.fdebug('[FILECHECKER] volume detected: ' + str(volrem)) if comicvolume: ComVersChk = re.sub("[^0-9]", "", comicvolume) if ComVersChk == '' or ComVersChk == '1': ComVersChk = 0 else: ComVersChk = 0 # even if it's a V1, we need to pull the date for the given issue ID and get the publication year # for the issue. Because even if it's a V1, if there are additional Volumes then it's possible that # it will take the incorrect series. (ie. Detective Comics (1937) & Detective Comics (2011). # If issue #28 (2013) is found, it exists in both series, and because DC 1937 is a V1, it will bypass # the year check which will result in the incorrect series being picked (1937) #set the issue/year threshold here. # 2013 - (24issues/12) = 2011. #minyear = int(comyear) - (int(issuetotal) / 12) maxyear = manual['LatestDate'][:4] # yyyy-mm-dd #subnm defined at being of module. len_sm = len(subnm) #print ("there are " + str(lenm) + " words.") cnt = 0 yearmatch = "none" #logger.fdebug('[FILECHECKER] subsplit : ' + subsplit) versionmatch = "false" if vers4year is not "no" or vers4vol is not "no": if comicvolume: D_ComicVersion = re.sub("[^0-9]", "", comicvolume) if D_ComicVersion == '': D_ComicVersion = 0 else: D_ComicVersion = 0 F_ComicVersion = re.sub("[^0-9]", "", volrem) S_ComicVersion = str(comyear) logger.fdebug('[FILECHECKER] FCVersion: ' + str(F_ComicVersion)) logger.fdebug('[FILECHECKER] DCVersion: ' + str(D_ComicVersion)) logger.fdebug('[FILECHECKER] SCVersion: ' + str(S_ComicVersion)) #if annualize == "true" and int(ComicYear) == int(F_ComicVersion): # logger.fdebug('[FILECHECKER] ("We matched on versions for annuals " + str(volrem)) try: if int(F_ComicVersion) == int(D_ComicVersion) or int(F_ComicVersion) == int(S_ComicVersion): logger.fdebug('[FILECHECKER] We matched on versions...' + str(volrem)) versionmatch = "true" yearmatch = "false" else: logger.fdebug('[FILECHECKER] Versions wrong. Ignoring possible match.') except ValueError: logger.warning('[FILECHECKER] Unable to determine version number. This issue will be skipped.') result_comyear = None while (cnt < len_sm): if subnm[cnt] is None: break if subnm[cnt] == ' ': pass else: logger.fdebug('[FILECHECKER] ' + str(cnt) + ' Bracket Word: ' + subnm[cnt]) #if ComVersChk == 0: # logger.fdebug('[FILECHECKER] Series version detected as V1 (only series in existance with that title). Bypassing year check') # yearmatch = "true" # break if (subnm[cnt].startswith('19') or subnm[cnt].startswith('20')) and len(subnm[cnt]) == 4: logger.fdebug('[FILECHECKER] year detected: ' + subnm[cnt]) result_comyear = subnm[cnt] ##### - checking to see what removing this does for the masses if int(result_comyear) <= int(maxyear) and int(result_comyear) >= int(comyear): logger.fdebug('[FILECHECKER] ' + str(result_comyear) + ' is within the series range of ' + str(comyear) + '-' + str(maxyear)) #still possible for incorrect match if multiple reboots of series end/start in same year yearmatch = "true" break else: logger.fdebug('[FILECHECKER] ' + str(result_comyear) + ' - not right - year not within series range of ' + str(comyear) + '-' + str(maxyear)) yearmatch = "false" #set to true for mass push check. break ##### - end check cnt+=1 if versionmatch == "false": if yearmatch == "false": logger.fdebug('[FILECHECKER] Failed to match on both version and issue year.') continue else: logger.fdebug('[FILECHECKER] Matched on year, not on version - continuing.') else: if yearmatch == "false": logger.fdebug('[FILECHECKER] Matched on version, but not on year - continuing.') else: logger.fdebug('[FILECHECKER] Matched on both version, and issue year - continuing.') logger.fdebug('[FILECHECKER] yearmatch string is : ' + str(yearmatch)) if yearmatch == "none": if ComVersChk == 0: logger.fdebug('[FILECHECKER] Series version detected as V1 (only series in existance with that title). Bypassing year check.') yearmatch = "true" else: continue if 'annual' in subname.lower(): subname = re.sub('annual', '', subname.lower()) subname = re.sub('\s+', ' ', subname) #if the sub has an annual, let's remove it from the modwatch as well modwatchcomic = re.sub('annual', '', modwatchcomic.lower()) isstitle_chk = False if titlechk: issuetitle = helpers.get_issue_title(ComicID=manual['ComicID'], IssueNumber=justthedigits) if issuetitle: vals = [] watchcomic_split = watchcomic.split() vals = mylar.search.IssueTitleCheck(issuetitle, watchcomic_split, splitit, splitst, issue_firstword, hyphensplit, orignzb=item) logger.fdebug('vals: ' + str(vals)) if vals: if vals[0]['status'] == 'continue': continue else: logger.fdebug('Issue title status returned of : ' + str(vals[0]['status'])) # will either be OK or pass. splitit = vals[0]['splitit'] splitst = vals[0]['splitst'] isstitle_chk = vals[0]['isstitle_chk'] possibleissue_num = vals[0]['possibleissue_num'] #if the issue title was present and it contained a numeric, it will pull that as the issue incorrectly if isstitle_chk == True: justthedigits = possibleissue_num subname = re.sub(' '.join(vals[0]['isstitle_removal']), '', subname).strip() else: logger.fdebug('No issue title.') #tmpitem = item[:jtd_len] # if it's an alphanumeric with a space, rejoin, so we can remove it cleanly just below this. substring_removal = None poss_alpha = subname.split(' ')[-1:] logger.fdebug('[FILECHECKER] poss_alpha: ' + str(poss_alpha)) logger.fdebug('[FILECHECKER] lenalpha: ' + str(len(''.join(poss_alpha)))) for issexcept in issue_exceptions: if issexcept.lower()in str(poss_alpha).lower() and len(''.join(poss_alpha)) <= len(issexcept): #get the last 2 words so that we can remove them cleanly substring_removal = ' '.join(subname.split(' ')[-2:]) substring_join = ''.join(subname.split(' ')[-2:]) logger.fdebug('[FILECHECKER] substring_removal: ' + substring_removal) logger.fdebug('[FILECHECKER] substring_join: ' + substring_join) break if substring_removal is not None: sub_removed = subname.replace('_', ' ').replace(substring_removal, substring_join) else: sub_removed = subname.replace('_', ' ') logger.fdebug('[FILECHECKER] sub_removed: ' + sub_removed) split_sub = sub_removed.rsplit(' ', 1)[0].split(' ') #removes last word (assuming it's the issue#) split_mod = modwatchcomic.replace('_', ' ').split() #batman i = 0 newc = '' while (i < len(split_mod)): newc += split_sub[i] + ' ' i+=1 if newc: split_sub = newc.strip().split() logger.fdebug('[FILECHECKER] split_sub: ' + str(split_sub)) logger.fdebug('[FILECHECKER] split_mod: ' + str(split_mod)) x = len(split_sub) -1 scnt = 0 if x > len(split_mod) -1: logger.fdebug('[FILECHECKER] number of words do not match...aborting.') else: while (x > -1): logger.fdebug(str(split_sub[x]) + ' comparing to ' + str(split_mod[x])) if str(split_sub[x]).lower() == str(split_mod[x]).lower(): scnt+=1 logger.fdebug('[FILECHECKER] word match exact. ' + str(scnt) + '/' + str(len(split_mod))) x-=1 wordcnt = int(scnt) logger.fdebug('[FILECHECKER] scnt:' + str(scnt)) totalcnt = int(len(split_mod)) logger.fdebug('[FILECHECKER] split_mod length:' + str(totalcnt)) try: spercent = (wordcnt /totalcnt) * 100 except ZeroDivisionError: spercent = 0 logger.fdebug('[FILECHECKER] we got ' + str(spercent) + ' percent.') if int(spercent) >= 80: logger.fdebug('[FILECHECKER] this should be considered an exact match.Justthedigits:' + justthedigits) else: logger.fdebug('[FILECHECKER] failure - not an exact match.') continue if comicsize == 0: logger.fdebug('[FILECHECKER] Size of given file is 0 bytes. Ignoring.') continue if manual: #print item #print comicpath #print comicsize #print result_comyear #print justthedigits comiclist.append({ 'ComicFilename': item, 'ComicLocation': comicpath, 'ComicSize': comicsize, 'ComicYear': result_comyear, 'JusttheDigits': justthedigits }) #print('appended.') # watchmatch['comiclist'] = comiclist # break else: if moddir is not None: item = os.path.join(moddir, item) comiclist.append({ 'ComicFilename': item, 'ComicLocation': comicpath, 'ComicSize': comicsize, 'JusttheDigits': justthedigits, 'AnnualComicID': annual_comicid }) #crcvalue = crc(comicpath) #logger.fdebug('[FILECHECKER] CRC is calculated as ' + str(crcvalue) + ' for : ' + item) watchmatch['comiclist'] = comiclist break else: #directory found - ignoring pass logger.fdebug('[FILECHECKER] you have a total of ' + str(comiccnt) + ' ' + watchcomic + ' comics') watchmatch['comiccount'] = comiccnt return watchmatch def validateAndCreateDirectory(dir, create=False, module=None): if module is None: module = '' module += '[DIRECTORY-CHECK]' if os.path.exists(dir): logger.info(module + ' Found comic directory: ' + dir) return True else: logger.warn(module + ' Could not find comic directory: ' + dir) if create: if dir.strip(): logger.info(module + ' Creating comic directory (' + str(mylar.CHMOD_DIR) + ') : ' + dir) try: permission = int(mylar.CHMOD_DIR, 8) os.umask(0) # this is probably redudant, but it doesn't hurt to clear the umask here. os.makedirs(dir.rstrip(), permission) except OSError: raise SystemExit(module + ' Could not create directory: ' + dir + '. Exiting....') return True else: logger.warn(module + ' Provided directory is blank, aborting') return False return False def indices(string, char): return [i for i, c in enumerate(string) if c == char] def traverse_directories(dir): filelist = [] for (dirname, subs, files) in os.walk(dir): for fname in files: if dirname == dir: direc = '' else: direc = dirname if '.AppleDouble' in direc: #Ignoring MAC OS Finder directory of cached files (/.AppleDouble/) continue filelist.append({"directory": direc, "filename": fname}) logger.fdebug('there are ' + str(len(filelist)) + ' files.') #logger.fdeubg(filelist) return filelist def crc(filename): #memory in lieu of speed (line by line) #prev = 0 #for eachLine in open(filename,"rb"): # prev = zlib.crc32(eachLine, prev) #return "%X"%(prev & 0xFFFFFFFF) #speed in lieu of memory (file into memory entirely) return "%X" % (zlib.crc32(open(filename, "rb").read()) & 0xFFFFFFFF)