""" This is three programs in one. USAGE: eft: [...] When invoked as eft.py, this module assumes correct values for the different fronimo fields collected by filling in a form off the website. It makes assumptions about the exact ordering and positioning of fields in the fronimo file. If we add more fields or subtract any in the future, this will have to be dealt with. Also, if the fronimo file structure changes, we will need a new template file as well. If the contributor provides a fronimo file along with the form, then this will read in the fronimo file provided, then write it out to another fronimo file, including data provided in the form that duplicates the standard format of other files on this site. If there is a file provided in another format (such as pdf, museScore, etc.), this creates a new blank fronimo file with the data from the form inserted. Output goes into a directory under the "normalized" name of the contributor, like "contributors/frank_a_gerbode". Each setting goes into its own directory, with a date and time stamp and, perhaps, an abbreviated title. The original contributed file[s] also go[es] into this directory, in a subdirectory "orig". USAGE: cft [ ... ] When invoked as cft.py, it takes each json file and "corrects" the fronimo file given in the "stFile" field of each record in that file with the other data in that record. USAGE: dft.py [-j] [-o ] [[...]] When invoked as dft.py, it dumps data, starting from the directory in which it is invoked, or from one or more other directories given on the command line, and proceeding down the directory tree of each. When invoked with the -j option, by default it dumps the data in json format as dft.json; otherwise in tsv format as dft.tsv. With the -o option, you can specify an output file to write to. 
""" PYTHONBREAKBOINT = 0 #disallow breakpoints from unicode import latin1_to_ascii import json import pdb import re import os import sys import time import glob import shutil from pprint import pprint, pformat import gzip import csv import platform import inspect from functools import partial #Globals currOS = platform.platform() if currOS.find('Windows') == 0: LOCALBASE = 'C:/website/' #LOCALBASE = 'A:/test/' else: LOCALBASE = '/ssd/home/sarge/prog/python/fron' instCount = 0 #count of new/incorrect instruments partCount = 0 #count of items in partCount that are int in ensemble typeCount = 0 #count of new/incorrect types nameCount = 0 #count of new/incorrect proper names: composers, publishers, anthologists, etc. facCount = 0 #count of unmatched facsimile strings def join_with_fslash(s1, s2): return(s1 + '/' + s2) # end join_with_fslash startTime = int(time.time()) CONTRIBDIR = LOCALBASE + "contributors" NAMESFILE = LOCALBASE + "namedata.tsv" INSTSFILE = LOCALBASE + "instruments.tsv" TYPEFILE = LOCALBASE + "types.tsv" TEMPLATE = LOCALBASE + "templates/template.ft3" FACHEAD = LOCALBASE + "facsimiles/" HTTPBASE = "http://gerbode.net/" # Error files NEWNAMES = LOCALBASE + "newnames.txt" NEWTYPES = LOCALBASE + "newtypes.txt" NEWINSTS = LOCALBASE + "newinsts.txt" DERRFILE = LOCALBASE + "dfterrs.txt" CERRFILE = LOCALBASE + "cfterrs.txt" EERRFILE = LOCALBASE + "efterrs.txt" # to keep track of nonexistent directories, as a time saver. #As more facsimiles are added, this will have to be updated. 
NOFACDIRS = LOCALBASE + "noFacDirs.txt"
FACERRS = LOCALBASE + "facerrs.txt"

# directories where we don't look for fronimo files
BADDIRS = ['midi', 'tabs', 'pdf', 'other', 'videos', 'TEMP', 'old_dft.pls',
           'fronimo', 'images', 'icons', 'index_files', 'ftp', 'facsimiles',
           'making_lute_music_accessible_files', 'templates']

# output files
TSVFILE = LOCALBASE + "dft.tsv"
JSONFILE = LOCALBASE + "dft.json"

# For converting RTF files to latin1 and vice versa
RTFPREFIX = "{\\rtf1\\ansi\\ansicpg1252\\deff0\\deflang1033{\\fonttbl{\\f0\\fnil\\fcharset0 MS Shell\r\nDlg;}}\r\n\\viewkind4\\uc1\\pard\\f0\\fs-22 ";
RTFSUFFIX = "\\par\r\n}\r\n";

IMAGETYPES = ('.png', '.tif', '.pdf', '.jpg')

# Compiled regular expressions
# use re.DOTALL to match \n as well
# In page strings, '#' = number in a series; '%' = arbitrary number to make
# source unique.
# All patterns are raw strings so that regex escapes such as \D and \. are not
# treated as (invalid) Python string escapes.

# "\uXXXX " sequence: backslash, 'u', four characters and a trailing space
reUnicode = re.compile(r'(\\u.... )')
# "A and B" / "A or B" -> the two names
reSplitAndOr = re.compile(r'(..*) (?:and|or) (..*)')
# Date in parentheses, e.g. "(1574)" or "(c.1574)"; group 1 = the date text.
# The parentheses have to be escaped: a bare '$' in their place can never be
# followed by more text, so the pattern would never match a date.
reGetDate = re.compile(r'\D\((c?a?\.? ?[1-2][0-9]{3})\)', re.DOTALL)
# Leading "c", "ca", "c." or "ca." marker of an approximate date (group 1)
reApproxDate = re.compile(r'(ca?\.? ?)[1-2][0-9]{3}', re.DOTALL)
# group 1 = text before "(", group 2 = parenthesised text, group 3 = the rest
reParenContents = re.compile(r'^([^(]*)\(([^)]+)\)(.*)', re.DOTALL)
reEntabulated = re.compile(r'([IiEe]ntabulated)', re.DOTALL)
reEncoded = re.compile(r'[Ee](ncoded)', re.DOTALL)
reEdited = re.compile(r'[Ee](dited)', re.DOTALL)
# The abbreviations "enc." and "ed.".  These used to be compiled with the
# magic flag value 1 (the old re.TEMPLATE, deprecated in 3.11 and since
# removed); no flag is needed for these patterns.
reEnc = re.compile(r'[Ee](nc\.)')
reEd = re.compile(r'[Ee](d\.)')
# "Xxx...: value" info line; group 1 = first three characters of the field
# name, group 2 = the value
reInfo = re.compile(r'^(...).*?: *(..*)$', re.DOTALL)
reKey = re.compile(r'^[A-G][b#]*[Mm]$', re.DOTALL)
# "key" ... "value" pair on one line of a json record
reGetDictItem = re.compile(r'"([^"]*)"[^"]*"([^"]*)"')
# RTF language tag such as \lang1033 (the quantifier was misplaced: "{4)}")
reDeLang = re.compile(r'(\\lang[0-9]{4})')

# These are for error messages.
# The argument n counts stack frames above the function that calls the helper:
# n = 0 names that function itself,
# n = 1 names its caller (the default for funcName and lineNo),
# n = 2 names the caller of its caller (the default for callerName), etc.
funcName = lambda n=1: sys._getframe(n + 1).f_code.co_name
callerName = lambda n=2: sys._getframe(n + 1).f_code.co_name
lineNo = lambda n=1: sys._getframe(n + 1).f_lineno


# insert a string into another string at a specific location
def insert_str(stInsert, str, index):
    """Return ``str`` with ``stInsert`` spliced in before position ``index``.

    The second parameter shadows the builtin ``str``; the name is kept so that
    existing keyword callers keep working.
    """
    # The body used to reference an undefined name (strInsert) and raised
    # NameError on every call.
    return str[:index] + stInsert + str[index:]


def at_eof(f):
    """Return True when f's current position equals the size of its file."""
    return f.tell() == os.fstat(f.fileno()).st_size


def show_tuple(t):
    """Print the items of t on one line, each followed by ', '."""
    for item in t:
        print("%s, " % item, end = "")
    print("")


difficulties = ["??", "Beginner", "Easy", "Medium", "Challenge", "Difficult", "Virtuoso"]

# (column heading, Fronimo attribute name) pairs
field_map = [
    ("Title", "title"),
    ("Subtitle", "subtitle"),
    ("Composer", "composer"),
    ("Orig. composer", "composer0"),
    ("Footnote", "footnote"),
    # was "Source": the attribute used everywhere else (and in key_order) is
    # the lower-case "source"
    ("Source", "source"),
    ("Document", "document"),
    ("Volume", "volume"),
    ("Date", "date"),
    ("Page", "page"),
    ("Editor", "editor"),
    ("Encoder", "encoder"),
    ("Arranger", "arranger"),
    ("Intabulator", "intabulator"),
    ("Concordances", "concordances"),
    ("Contributor", "contributor"),
    ("Info", "info"),
    ("Piece", "piece"),
    ("Section", "section"),
    ("Type", "type"),
    ("Key", "key"),
    ("Difficulty", "difficulty"),
    ("Ensemble", "ensemble"),
    ("Part", "part"),
    ("Remarks", "remarks"),
    ("Recording", "recurl"),
    ("Facsimile", "facurl"),
    ("Fronimo", "stFile"),
    ("PDF", "stPdf"),
    ("Midi", "stMidi"),
    ("Modified", "mtime"),
    ("Created", "ctime"),
]

key_order = [ # for dumping
    "title", "subtitle", "composer", "composer0", "footnote", "source",
    "document", "volume", "date", "page", "editor", "encoder", "arranger",
    "intabulator", "contributor", "info", "concordances", "piece", "section",
    "type", "key", "difficulty", "ensemble", "part", "remarks", "recurl",
    "facurl", "stFile", "stPdf", "stMidi", "ctime", "mtime",
]


#Creates fronimo object from file with name stIn
class Fronimo:
    """In-memory representation of one Fronimo (.ft3) file.

    NOTE(review): this class body was reconstructed from a source whose line
    breaks and indentation had been flattened; block nesting follows the most
    plausible reading of the statements.
    """

    def __init__(self, stIn = TEMPLATE):
        """Read the gzipped fronimo file stIn and populate the instance.

        On any failure self.start_offset is set to -1, which callers use as
        the marker for a failed instantiation.
        """
        # set all attributes to empty string
        for col, att in field_map:
            setattr(self, att, "")
        self.base = LOCALBASE
        self.currProg = ''
        self.credits = ''
        self.begText = ''
        self.endText = ''
        self.performance = False
        self.simple = False
        self.ornamented = False
        self.footnote = ""
        #input file name set from argument stIn
        self.stFile = stIn
        self.stFron = ''
        self.flFron = None
        self.flOut = None
        self.flErr = None
        self.offset = None
        self.oldOffset = None
        self.volume = ""
        self.info = ""
        self.recurl = ""
        self.facurl = ""
        #keep track of padding of the last dir contacted that contains numerical pages
        #self.lastPageDir = ["", 0]

        # Creates self.stFron from fronimo file: self.stFile
        if self.read_and_unzip_file() == False:
            # marker for failure of class instantiation
            self.start_offset = -1
        else:
            self.start_offset = self.get_start_offset()
            if self.start_offset > 0:
                # Fills in other values of fronimo object.
                if self.populate() == False:
                    self.start_offset = -1
    # end of __init___

    # Open error file
    @classmethod
    def open_error(kls, errfile):
        """Open errfile for writing as the class-wide error log (kls.flErr).

        Returns True on success, False (after a message on stderr) otherwise.
        """
        try:
            kls.flErr = open(errfile, "w", encoding = 'latin1')
        except OSError:
            print("OSError: Cannot open error File ", errfile, " for writing.", file=sys.stderr)
            return False
        except Exception:
            print("Other error: Cannot open error File ", errfile, " for writing.", file=sys.stderr)
            return False
        return True
    # End open_error

    @classmethod
    def open_file(kls, fl, mode):
        """Open fl in the given mode with latin1 encoding.

        Returns the file object, or None after logging through print_error.
        """
        try:
            flName = open(fl, mode, encoding='latin1')
        except OSError:
            err = "OSError: Cannot open file %s in mode %s." % (fl, mode)
            kls.print_error(fl, err)
            return None
        except Exception:
            err = "Other error: Cannot open file %s in mode %s." % (fl, mode)
            kls.print_error(fl, err)
            return None
        return flName
    # end of openFile

    # open and read all needed fronimo-related files
    @classmethod
    def open_files(kls):
        """Load the name, instrument and type tables and open the log files.

        Fills kls.nameList, kls.instList, kls.typeList and kls.noFacList and
        opens the newnames / newtypes / newinsts / facerrs output files.
        Returns True on success, False as soon as one file cannot be opened.
        """
        kls.nameList = []
        kls.typeList = []
        kls.instList = []
        kls.noFacList = []

        # Read in and process list of recognized names
        kls.flNames = kls.open_file(NAMESFILE, "r")
        if not kls.flNames:
            print("Cannot open NAMESFILE file.")
            return False
        for line in kls.flNames.read().split('\n'):
            if line == '':
                continue
            lsRec = line.split('\t')
            # First, get the default directory name for this proper name source
            # That's capitalized last name + capitalized first letter of first name
            # Get the first and last names from 2nd list record.
            # Unless there is no first name (or the record is too short or
            # malformed), in which case the first field is used as is.
            slug = lsRec[0]
            if len(lsRec) > 1 and ',' in lsRec[1]:
                val = re.search('^([^,][^,]+), *(..*)', lsRec[1])
                if val:
                    slug = val.group(1) + val.group(2)[0]
            # Remove accents
            slug = latin1_to_ascii(slug)
            # Prepend it to the list record
            lsRec.insert(0, slug)
            # and add the record to the names list.
            kls.nameList.append(lsRec)

        # Read in instrument list
        kls.flInsts = kls.open_file(INSTSFILE, "r")
        if not kls.flInsts:
            print("Cannot open INSTSFILE file.")
            return False
        for line in kls.flInsts.read().split('\n'):
            kls.instList.append(line.split('\t'))

        # Read in types list
        kls.flTypes = kls.open_file(TYPEFILE, "r")
        if not kls.flTypes:
            print("Cannot open TYPEFILE file.")
            return False
        for line in kls.flTypes.read().split('\n'):
            kls.typeList.append(line.split('\t'))

        #Read in past references to nonexistent facsimiles
        #A time saver, but needs to be updated as facsimiles are added
        kls.flNoFacDirs = kls.open_file(NOFACDIRS, "a+")
        if not kls.flNoFacDirs:
            print("Cannot open NOFACDIRS file.")
            return False
        # Read in the whole list as is
        # Go to the beginning to read it in
        kls.flNoFacDirs.seek(0)
        stDirs = kls.flNoFacDirs.read()
        # Go to the end to append; maybe unnecessary...
        # (the arguments used to be swapped, seek(2, 0), which moves to byte 2)
        kls.flNoFacDirs.seek(0, os.SEEK_END)
        kls.noFacList = stDirs.split('\n')

        #Open error lists
        kls.flNewNames = kls.open_file(NEWNAMES, "w")
        if not kls.flNewNames:
            print("Cannot open NEWNAMES file.")
            return False
        kls.flNewTypes = kls.open_file(NEWTYPES, "w")
        if not kls.flNewTypes:
            print("Cannot open NEWTYPES file.")
            return False
        kls.flNewInsts = kls.open_file(NEWINSTS, "w")
        if not kls.flNewInsts:
            print("Cannot open NEWINSTS file.")
            return False
        kls.flFacErrs = kls.open_file(FACERRS, "w")
        if not kls.flFacErrs:
            print("Cannot open FACERRS file.")
            return False
        return True
    # end open_files

    @classmethod
    def print_headers(kls):
        """Write the opening '[' (json) or the column-heading row (tsv)."""
        if kls.fJson:
            print("[", file=kls.flJson)
        else:
            headings = [
                "Title", "Subtitle", "Composer", "Orig. composer", "Source",
                "Document", "Volume", "Date", "Page", "Editor", "Encoder",
                "Arranger", "Intabulator", "Contributor", "Concordances",
                "Piece", "Section", "Type", "Key", "Difficulty", "Ensemble",
                "Part", "Remarks", "Recording", "Facsimile", "Fronimo", "PDF",
                "Midi", "Created", "Modified",
            ]
            print("\t".join(headings), file=kls.flTsv)
    # end print_headers

    # for future use
    def make_contrib_dir(self):
        """Create (if needed) the contributor's directory under CONTRIBDIR.

        The directory name is the case-folded contributor name with periods
        removed and spaces turned into underscores; it is stored in
        self.contribDir.  This used to be declared as a classmethod while
        reading self.contributor, so it could not run; it also discarded the
        space replacement ("s - s.replace(...)").
        """
        s = self.contributor.casefold()
        s = s.replace('.', '')
        s = s.replace(' ', '_')
        self.contribDir = join_with_fslash(CONTRIBDIR, s)
        os.makedirs(self.contribDir, exist_ok=True)

    # Get canonical name for type
    @classmethod
    def find_canonical_type(kls, typ):
        """Return the canonical name for typ from kls.typeList, or None."""
        typ = typ.strip().lower()
        for item in kls.typeList:
            # records from blank lines have no second field
            if len(item) > 1 and item[0] == typ:
                return item[1]
        return None
    #end find_canonical_type

    # Finds all types in a hierarchy below a given canonical type
    @classmethod
    def get_all_types(kls, typ):
        """Return [typ] followed by every type below it, without duplicates."""
        typOut = [typ]
        for t in kls.typeList:
            if len(t) == 3: #only look at items that have higher types
                # if there is an "and"
                tpH = t[2].split('&')
                # See if the search pattern matches one of the alternatives
                if typ in tpH:
                    # No duplications allowed.  (The old test compared the
                    # whole sub-list against the elements of typOut, which
                    # never matched, so duplicates slipped through.)
                    for sub in kls.get_all_types(t[1]):
                        if sub not in typOut:
                            typOut.append(sub)
        return typOut

    # Starts with a comma-separated list of types, validates them,
    # gets canonical name,
    # and returns the validated list with all sub-types
    @classmethod
    def get_type_list(kls, typesIn):
        """Expand a comma-separated type list into canonical types + sub-types.

        Unknown types are reported on stdout and skipped.
        """
        typesOut = []
        for typ in typesIn.split(','):
            t = kls.find_canonical_type(typ)
            if t is None:
                print("Type %s not found." % typ)
                # Fronimo.print_error("Type %s not found." % typ)
                continue
            typesOut += kls.get_all_types(t)
        return(typesOut)

    # print helpful error message
    @classmethod
    def print_error(kls, currFile, errMsg):
        """Append a message about currFile to the class error log (kls.flErr).

        The message names the function that called print_error, that
        function's caller and the calling line, followed by kls.currProg.
        """
        print("In %s; caller:%s; line:%d of %s\nfile:%s; %s." % (funcName(), callerName(), lineNo(), kls.currProg, currFile, errMsg), file=kls.flErr)
        kls.flErr.flush()

    # correct all the files given in a json file
    @classmethod
    def correct_all(kls, jFile):
        """Apply every record of the json file jFile to its fronimo file.

        Each record names its fronimo file in "stFile"; the remaining items
        replace the corresponding values in that file.  Returns the number of
        fronimo files actually rewritten.
        """
        def near_start(text, ch):
            # True when ch first occurs within the first six characters
            return 0 <= text.find(ch) <= 5

        count = 0
        # Open name- instrument- and type-related files
        if not Fronimo.open_files():
            return count
        f = Fronimo.open_file(jFile, "r")
        if f is None:
            return count
        with f:
            line = f.readline()
            # Find and discard the '[' at the beginning of the file
            while not at_eof(f) and not near_start(line, '['):
                line = f.readline()
            while not at_eof(f) and not near_start(line, ']'):
                # Find and discard { at beginning of json record
                while not at_eof(f) and not near_start(line, '{'):
                    line = f.readline()
                if at_eof(f):
                    return(count)
                dChanges = {}
                # Read the json record up to the "}" and load dChanges
                while not at_eof(f) and not near_start(line, '}'):
                    # Find the first quote
                    while not at_eof(f) and not near_start(line, '"'):
                        line = f.readline()
                    if at_eof(f):
                        return(count)
                    # protect escaped quotes while the pair is extracted
                    line = line.replace('\\"', '\xA4')
                    item = reGetDictItem.search(line)
                    if item is None:
                        Fronimo.print_error(jFile, "cannot parse json line, %s" % line.strip())
                        # move on, otherwise the same line is examined forever
                        line = f.readline()
                        continue
                    typ = item.group(1)
                    val = item.group(2).replace('\xA4', '"')
                    dChanges[typ] = val
                    line = f.readline()

                # Load fron file corresponding to this record
                fronFile = dChanges.get('stFile', '')
                if not os.path.isfile(fronFile):
                    print("%s does not exist" % fronFile)
                    continue
                fron = Fronimo(fronFile)
                #fron.stFron now created from this fronimo file.
                # and fronimo object fron populated with values from that file.
                # punt if corresponding file not found
                if fron.start_offset == -1:
                    continue
                # Save old fronimo string.
                stOldFron = fron.stFron
                # Load changes from json record into fron, replacing old values
                fron.load_changes(dChanges)
                if not fron.depopulate():
                    Fronimo.print_error(fronFile, "Cannot update fronimo file string.")
                    continue
                # if no changes to make, do nothing.
                if stOldFron == fron.stFron:
                    continue
                if not fron.write_file(False):
                    Fronimo.print_error(fronFile, "Cannot write out fronimo file.")
                    continue
                count += 1
                line = f.readline()
        return count
    #end correct_all

    #print missing fax
    def no_fac(self, location):
        """Log a missing facsimile file for this fronimo file to flFacErrs."""
        # No point in printing out facsimile directory location
        location = location.replace(FACHEAD, '')
        ft3_file = self.stFile.replace(LOCALBASE, '')
        ft3_file = ft3_file.replace('composers', 'cmps')
        ft3_file = ft3_file.replace('sources', 'srcs')
        msg = ft3_file + ' ||| ' + location
        print(msg, file=Fronimo.flFacErrs)

    # Get starting offset for reading/writing a fronimo file
    def get_start_offset(self):
        """Return 364 or 368 depending on the byte at index 4, or -1 if unknown."""
        marker = self.stFron[4]
        if marker in ('\x15', '\x14'):
            return 364
        elif marker == '\x16':
            return 368
        else:
            val = hex(ord(marker))
            msg = "stFron[4] = %s (not \\x15 or \\x16), so starting offset is unknown" % (val)
            Fronimo.print_error(self.stFile, msg)
            return -1
    # end get_start_offset

    # load changes from a json record into a Fronimo class instance
    def load_changes(self, dChanges):
        """Set one attribute on self for every key/value pair of dChanges."""
        for k, v in dChanges.items():
            setattr(self, k, v)
    # end of load_changes

    # read and unzip a fronimo file with name self.stFile into self.stFron
    def read_and_unzip_file(self):
        """Decompress self.stFile into the latin1 string self.stFron.

        Returns True on success; logs through print_error and returns False
        when the file cannot be opened, decompressed, or is under 100 bytes.
        """
        # gzip.open() never returns None; failures surface as exceptions,
        # either on opening or on the first read.
        try:
            self.flFron = gzip.open(self.stFile)
            try:
                # Read entire file into a byte array
                btFron = self.flFron.read()
            finally:
                self.flFron.close()
        except (OSError, EOFError) as err:
            Fronimo.print_error(self.stFile, 'Cannot open and unzip fronimo file: %s' % err)
            return False
        if len(btFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot read fronimo file.')
            return False
        self.stFron = btFron.decode("latin1")
        if len(self.stFron) < 100:
            Fronimo.print_error(self.stFile, 'Cannot decode fronimo file.')
            return False
        return True
    # end of read_and_unzip_file

    # get x number of chars from string.
Updates offset value def _get(self, numChars): if numChars == 0: return "" oldOffset = self.offset self.offset += numChars if (self.offset) > len(self.stFron): errMsg = "Cannot get %d chars starting at offset %d" % (numChars, oldOffset) Fronimo.print_error(self.stFile, errMsg) return "" return self.stFron[oldOffset:self.offset] # Get two bytes of info def _getWord(self, fSigned): inCh = self._get(1) if inCh == "": errMsg = "No first byte in stFron at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return -1 word1 = ord(inCh) inCh = self._get(1) if inCh == "": errMsg = "No 2nd byte in stFron at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return -1 word2 = 256 * ord(inCh) word = word1 + word2 if (fSigned and word > 32768): word -= 65536 return word # Get a fronimo-formatted string def _getBstr(self): # first byte is string length if < 255 firstByte = self._get(1) if firstByte == False: errMsg = "No first byte in stFron at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return "" length = ord(firstByte) if length == 0: return "" # First char 255 means a long string. 
# Next 2 chars determine string length as an unsigned integer if length == 255: length = self._getWord(False) if length == -1: errMsg = "Zero string length from _getWord, offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return "" stOut = self._get(length) if stOut == "": errMsg = "Result of get(length) is "", at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return "" return stOut def _put(self, stNew): # Assumes new string is RTF'd, if necessary, but not in fronimo string format, # with leading length indicator # Find length of old fronimo string ch = self._get(1) if ch == '\xFF': # means next 2 chars determine length byte1 = self._get(1) if byte1 == "": errMsg = "No first byte in stFron at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return False word1 = ord(byte1) byte2 = self._get(1) if byte2 == "": errMsg = "No second byte in stFron at offset %d" % (self.offset) Fronimo.print_error(self.stFile, errMsg) return False # second byte is higher order word2 = 256 * ord(byte2) # Length of actual string + 3 bytes to specify the length length = word1 + word2 + 3 # reset offset to compensate for 3 _get(1)'s self.offset -= 3 else: # Length of actual string + 1 byte to specify the length length = ord(ch) + 1 # reset offset to compensate for 1 _get(1) self.offset -= 1 # find length of new string newLen = len(stNew) if newLen < 255: stInsert = chr(newLen) # We will add one length indicator to the head of the string newLen += 1 else: stInsert = chr(255) + chr(newLen % 256) + chr(int(newLen / 256)) # We will add three length indicators to the head of the string newLen += 3 # Add leading length indicator stNew = stInsert + stNew # splice in the new string stOut = self.stFron[:self.offset] + stNew + self.stFron[self.offset + length:] # set new offset self.offset += newLen self.stFron = stOut return True # end _put def latin2rtf(self, stIn): stOut = "" for i in range (len(stIn)): s = stIn[i] if ord(s) >= 127: # hex 
value of latin1 char --> last 2 chars of rtf code hexchars = hex(ord(s)) hexchars = hexchars.replace('0x', '') s = "\\\'" + hexchars elif s == '|': s = '\\par\r\n' stOut += s return RTFPREFIX + stOut + RTFSUFFIX def replace_rtf_codes(self, stIn): accentLoc = stIn.find('\\\'') while accentLoc >= 0: # Get last 2 characters of string \'xx = hex char value rtfCode = stIn[accentLoc + 2:accentLoc + 4] # convert to latin1 character latin1 = chr(int(rtfCode, 16)) if latin1: stIn = stIn.replace(rtfCode, latin1, 1) stIn = stIn.replace("\\'", '', 1) accentLoc = stIn.find('\\\'') stIn = stIn.replace('\\par', '|') stIn = stIn.replace('\\cf1', '') stIn = stIn.replace('\\cf0', '') stIn = re.sub(r'\\lang[0-9]*', '', stIn) stIn = stIn.replace(' ', ' ') # KLUDGE to handle unicode weirdness val = reUnicode.search(stIn) if val: uni = val.group(2) # lop off final character from unicode sequence stIn = stIn.replace(uni, uni[:-1]) return(stIn) def rtf2latin(self, stRtf): if stRtf.find('{\\rtf', 0) != 0: return stRtf if len(stRtf) < 100: return stRtf start = stRtf.find('\\f0\\fs', 0) if start == -1: return stRtf start += 9 end = stRtf.find('\\par\r\n}\r\n', start + 1) if end == -1: return stRtf stOut = stRtf[start:end] stOut = stOut.replace('\\par\r\n', '\n') stOut = stOut.replace('\\{', '{') stOut = stOut.replace('\\}', '}') # KLUDGE to get rid of \langxxxx peculiarity that sometimes shows up # Might want to reinstate this if we can figure out what it means val = reDeLang.search(stOut) if val: lang1234 = val.group(1) stOut = stOut.replace(lang1234, "") stOut = stOut.strip(' \t') stOut = self.replace_rtf_codes(stOut) return stOut def get_composer0(self): if self.subtitle == '': return '' #Leave subtitle intact but mine for composer0 pcont = reParenContents.search(self.subtitle) if pcont: inParen = pcont.group(2) hyphenIndex = inParen.find(' - ') if hyphenIndex != -1: inParen = inParen[hyphenIndex + 3:] else: return '' return inParen def parse_document(self, doc): if doc == "": 
Fronimo.print_error(self.stFile, "No document") return False val = reGetDate.search(doc) # We have a date if val: self.date = val.group(1) self.date = self.date.strip() val = reApproxDate.search(self.date) if val: caMark = val.group(1) if caMark != "": self.date = self.date.replace(caMark, "") self.date = "c." + self.date val = (re.search('$c?a?\.? ?[0-9]{4}$, *([fp#%]{1,2}\.* *[^.]*)\.?$', doc, re.DOTALL)) if val: self.page = val.group(1) self.page = self.page.replace(" ", "") else: self.page = "" # this is not really an error, per se. # Fronimo.print_error(self.stFile, "No page # in %s" % doc) else: self.date = "" Fronimo.print_error(self.stFile, "Cannot get date from %s" % doc) # Look for a page anyway (unlikely) val = re.search(', *([fp#%]{1,2}\.* *.*)\.$', doc, re.DOTALL) if val: self.page = val.group(1) else: self.page = "" Fronimo.print_error(self.stFile, "No page # in %s" % doc) # Get document without date and page val = re.search(' ?$c?a?\.? ?[12][0-9]{3}$', doc) if val: end = val.span()[0] self.document = doc[:end] else: self.document = doc self.document = self.document.strip() val = re.search("([^,][^,]*), *v[. ] *([^,]+)", self.document) if val == None: self.volume = '' else: self.document = val.group(1) self.volume = val.group(2) return True #end of parse footnote def slugify(self,stIn): stIn = stIn.lower() stIn = latin1_to_ascii(stIn) stIn = re.sub(': *', '_', stIn) stIn = stIn.replace("'", '_') stIn = re.sub(' +', '_', stIn) return(stIn) def make_pagedir(self): # document and volume obtained in parse_footnote # and self.source specified in populate if not in parse_footnote # Handle the source field first src = self.source if re.match('[A-Z]+-[A-Z]+[a-z]*', src): #it's a library, so slugify it src = self.slugify(src) else: # It's a proper name: a composer, intabulator, publisher, anthologist, etc. 
# First check if it is in names list src = src.replace('?', '') src = src.replace('\n', '') found = False for item in self.nameList: if item[1] == src: #slugify it; item[0] contains existing slug src = item[0] found = True break if not found: # add to list of unknown names self.write_new_name(src) # So there won't be a knowable facsimile directory # if no knowable source field # So no point in continuing return '' # Now handle document field document = self.slugify(document) facurl = FACHEAD + src + '/' + document # 2 different handlings depending on whether there is a volume if self.volume: # make entry for volume directory, appending date # assumes a document directory doesn't have appended date when there are volumes. # and the volume files carry the date stVol = "v." + self.volume + '_' + self.date pageDir = facurl + '/' + stVol else: # assumes document directory has appended date if no volumes pageDir = facurl + '_' + self.date return pageDir # end of make_pagedir def parse_credits(self, cred): # expand abbreviations [Ee]d. 
[Ee]nc, and & cred = cred.replace(' & ', ' and ') cred = re.sub('\.$', '', cred) if reEd.search(cred): cred = cred.replace('d.', 'dited', 1) if reEnc.search(cred): cred = cred.replace('nc.', 'ncoded', 1) # Change [IiEe]ntabulated to Encoded val = reEntabulated.search(cred) if val: src = val.group(1) cred = cred.replace(src, "Encoded") #Handle "by", "and", and ";" in credits string val = re.search('[Ee](?:dited|ncoded) and [Ee](?:dited|ncoded) by (..*)$', cred, re.DOTALL) if val: if val.group(1) == 'S.Gerbode': self.editor = self.encoder = 'Sarge Gerbode' else: self.encoder = val.group(1) self.encoder = self.encoder.replace('S.Gerbode','Sarge Gerbode') self.editor = self.encoder return True val = re.search('(E(?:ncoded|dited)) by (..+) ?(?:[;.]|and) ([Ee](?:dited|ncoded)) by (..*)$', cred, re.DOTALL) if val: type1 = val.group(1) cred1 = val.group(2) type2 = val.group(3) cred2 = val.group(4) if type1 == "Edited": self.editor = cred1.strip() self.encoder= cred2.strip() else: self.encoder = cred1.strip() self.editor = cred2.strip() self.editor = self.editor.replace('S.Gerbode', 'Sarge Gerbode') self.encoder = self.encoder.replace('S.Gerbode', 'Sarge Gerbode') return True self.editor = self.encoder = "" return False # End parse_credits # sets source, document, volume, date, page, encoder, editor def parse_footnote(self): self.source = self.document = self.volume = self.date = self.page = self.encoder = self.editor = "" if self.footnote == "": return lsParts = re.split(' +', self.footnote) numParts = len(lsParts) if numParts < 2 or numParts > 3: stErr = "Footnote \"%s\" has wrong # of parts (%d)" % (self.footnote, numParts) Fronimo.print_error(self.stFile, stErr) return False if numParts == 2: # source == ''; later, source will = composer doc, cred = lsParts else: self.source, doc, cred = lsParts # return False if not self.parse_document(doc): Fronimo.print_error(self.stFile, "Cannot parse document:%s" % doc) return False if not self.parse_credits(cred): 
Fronimo.print_error(self.stFile, "Cannot parse credits: %s." % cred) return True # End parse_footnote def get_diff_val(self, difficulty): diff = difficulty.strip() if diff: diff = difficulty[:3] # convert to lower case diff = diff.lower() else: diff = '3' # Medium is default difficulty value if diff in ['0', '1', '2', '3', '4', '5', '6']: return ord(diff) - 48 elif diff == "beg": return 1 elif diff in ('eas', 'sim'): return 2 elif diff == 'med': return 3 elif diff == 'cha': return 4 elif diff in ('dif', 'har'): return 5 elif diff in ('vir', 'kil'): return 6 else: stErr = "Difficulty value \"%s\" not meaningful" % (difficulty) Fronimo.print_error(self.stFile, stErr) return 0 def parse_info(self): # initialize with existing values global instCount global partCount lsInfo = re.split('\n', self.info) isRemark = False remarks = "" for datum in lsInfo: datum = datum.strip() # Ignore blank lnes if datum == "": continue if isRemark: if remarks: remarks = remarks + '|' + datum else: remarks = datum else: isRemark = (datum.find('--') == 0) if isRemark: # everything after line stating with '--' is a remark continue if datum.find(':') == -1: continue # so it is a field # decode it reItem = reInfo.search(datum) if reItem: field = reItem.group(1) value = reItem.group(2) value = value.strip() field = field.lower() if field in ["tra", "rea", "arr"]: self.arranger = value elif field in ["lib", "sou", "pub" ]: self.source = value elif field in["ins", "ens" ]: self.ensemble = value elif field in ["doc"]: self.document = value elif field in ["ori", "co0"]: self.composer0 = value elif field == "tit": self.title = value elif field == "sub": self.subtitle = value elif field == "com": self.composer = value elif field == "doc": self.document = value elif field == "pag": self.page = value elif field == "edi": self.editor = value elif field == "enc": self.encoder = value elif field == "int": self.intabulator = value elif field == "con": self.concordances = value elif field == "pie": 
self.piece = value elif field == "fac": self.facsimile = value elif field == "rec": self.recording = value elif field == "sec": self.section = value elif field == "typ": self.type = value elif field == "key": self.key = value # if reKey.search(self.key) == None: # stErr = "Key \"%s\" missing or meaningless" % (self.key) # Fronimo.print_error(self.stFile, stErr) # self.key = "??" elif field == "dif": self.difficulty = self.get_diff_val(value) elif field == "par": self.part = value else: stErr = "Info field \"%s\ ""not found" %(field) Fronimo.print_error(self.stFile, stErr) else: continue self.remarks = remarks # checking to see of all parts list items are in the ensemble list. # first collect all items in the ensemble, including tags ensList = self.ensemble.split(",") insList = [] ensItems = [] for ens in ensList: ens = ens.strip() ens = ens.lower() if ens.find(":") > 0: instag = ens.split(":") # Include tags for part check but not for instrument check ensItems.append(instag[0].strip()) ensItems.append(instag[1].strip()) insList.append(instag[1].strip()) else: insList.append(ens) ensItems.append(ens) # Then check part list item against them if self.part != "": partList = self.part.split(",") for ins in partList: ins = ins.strip() ins = ins.lower() if ins == "score": continue if not ins in ensItems: stErr = "Part \"%s\" not in ensemble list" % (ins) partCount += 1 Fronimo.print_error(self.stFile, stErr) for ins in insList: found = False ins = ins.strip() ins = ins.lower() for item in Fronimo.instList: if item[0] == ins: found = True break if not found: instCount += 1 stErr = "Inst. \"%s\" not found in \"%s\"." 
% (ins, self.stFile) print(stErr, file=Fronimo.flNewInsts) Fronimo.flNewInsts.flush() # end of for datum in lsInfo return True # end of parse_info def write_new_name(self, newName): global nameCount nameCount += 1 stOut = "%s --> %s" % (newName, self.stFile) print(stOut, file=Fronimo.flNewNames) Fronimo.flNewNames.flush() def check_name(self, name): found = False name = name.replace('?', '') name = name.replace('\n', '') # write out list of names not found in names list val = reSplitAndOr.search(name) if val: lsName = [val.group(1)] + [val.group(2)] elif name: lsName = [name] else: lsName = [] for nm in lsName: found = False for item in Fronimo.nameList: if item[1] == nm: found = True break if not found: self.write_new_name(nm) return False return found # end of check_name def write_new_types(self): # get values from comma-separated list types = self.type.split(",") for typ in types: found = False typ = typ.strip() typ = typ.lower() typ = typ.replace("?", "") for item in Fronimo.typeList: if item[0] == typ: found = True break if not found: global typeCount typeCount += 1 stOut = "%s --> %s" % (typ, self.stFile) print(stOut, file=Fronimo.flNewTypes) Fronimo.flNewTypes.flush() # end of write_new_types # Get 0 padding for pages in the directory. 
# returns [pad, suffix]
# NOTE(review): every function in this span takes `self` and belongs to class
# Fronimo.  SOURCE carries no indentation, so they are laid out flat here;
# nest them one level deeper when splicing into the class body.
def get_pad(self, pageDir):
    """Return [pad, suffix] describing the page image files in pageDir.

    pad is the digit width implied by the highest page number found among
    the .png/.tif/.pdf/.jpg entries (0 when no entry is numeric, at most 3).
    suffix is the extension of the last such entry reported by os.listdir,
    or '' when there is none.
    """
    suffix = ''
    lastPage = 0
    for entry in os.listdir(pageDir):
        ext = entry[-4:]
        if ext not in ('.png', '.tif', '.pdf', '.jpg'):
            continue
        suffix = ext
        stem = entry[:-4]
        # strip position on page designation (12a -> 12); the emptiness test
        # keeps a file named just ".png" from raising IndexError
        if stem and stem[-1] in 'abcdefghijklm':
            stem = stem[:-1]
        # Get the last page number
        if stem.isdigit():
            lastPage = max(lastPage, int(stem))

    # file numbers are 0 padded
    if lastPage >= 100:
        pad = 3
    elif lastPage >= 10:
        pad = 2
    elif lastPage >= 1:
        pad = 1
    else:
        pad = 0
    return [pad, suffix]
#end get_pad


# Convert self.page into a valid directory entry
def get_page_entry(self, pad):
    """Turn self.page (e.g. 'f. 12v', 'p. 7b') into a page file stem.

    pad is the zero-fill width to apply to a numeric page; it is only used
    when greater than 1.  Returns '' when self.page is blank, contains '#'
    or '%', or cannot be parsed (the latter is reported through
    Fronimo.print_error).
    """
    # blank page or page containing % or # guarantees no facsimile file
    if self.page == '' or re.search('[#%]', self.page) is not None:
        return ''

    # raw string: same pattern as before, without the invalid '\.' escape
    val = re.search(r'^[fp][fp]*\. *([^,.][^,.]*)', self.page)
    if val is None:
        errmsg = "cannot parse page # %s" % (self.page)
        Fronimo.print_error(self.stFile, errmsg)
        return ''
    page = val.group(1)

    # strip position on page designation, but never strip the page down to
    # nothing: a one-letter page such as 'a' used to raise IndexError on
    # the verso test below
    lastChar = ''
    if len(page) > 1 and page[-1] in 'abcdefghijklm':
        lastChar = page[-1]
        page = page[:-1]

    verso = ''
    if page[-1] == 'v':
        verso = 'v'
        page = page[:-1]

    # Special case for leading pages like 00a, 000b, etc.
    # or like a2, l4v, etc.
    # NOTE(review): the position letter stripped above is not put back on
    # this path ('00a' yields '00') -- confirm that is intended.
    if (not page.isdigit()) or (int(page) == 0):
        return page + verso

    if pad > 1:
        page = page.zfill(pad)
    return page + verso + lastChar
# end get_page_entry


def make_pagedir(self):
    """Build the facsimile directory path for this piece.

    Returns '' when self.source is neither a library-style siglum nor a
    name found in self.nameList; in that case the name is handed to
    self.check_name first.
    """
    # document and volume obtained in parse_footnote
    # and self.source specified in populate if not in parse_footnote

    # Handling the source field
    src = self.source
    if re.match('[A-Z]+-[A-Z]+[a-z]*', src):
        # Get rid of apostrophes, colons, and spaces in library name
        src = self.slugify(src)
    else:
        # It's a proper name, a composer, intabulator, publisher, anthologist, etc.
        # First check if it is in names list
        for item in self.nameList:
            if item[1] == src:
                # item[0] is used in place of the name for the path
                src = item[0]
                break
        else:
            # it's not going to have a facurl if src is a compound name
            # but make sure we pick up any stray names anyway
            self.check_name(src)
            return ''

    # lower case document,get rid of accents and replace spaces and colons with underlines
    document = self.slugify(self.document)
    facurl = FACHEAD + src + '/' + document

    # 2 different handlings if there is a volume
    if self.volume:
        # make entry for volume directories, appending date
        # assumes a document directory doesn't have appended date when there are volumes.
        return facurl + '/' + "v." + self.volume + '_' + self.date
    # assumes document directory has appended date if no volumes
    return facurl + '_' + self.date
# end of make_pagedir


def get_facurl(self, svPageDir = [''], svPad = [0], svSuffix = ['']):
    """Return the path of the facsimile page file for this piece, or ''.

    The three one-element list defaults are deliberate: they act as static
    variables remembering the last page directory that was scanned together
    with its padding and suffix, so consecutive pieces from the same
    directory do not rescan it.

    Directories that do not exist are appended to self.noFacList and written
    to self.flNoFacDirs; a page file that does not exist is reported through
    self.no_fac.
    """
    # make directory that contains pages
    pageDir = self.make_pagedir()

    # No directory containing pages = no facsimile to find; likewise for a
    # directory already known to point nowhere.  Checked before the cache is
    # consulted so that a repeated dead directory cannot fall through to the
    # "same directory as last time" branch.
    if pageDir == '' or pageDir in self.noFacList:
        return ''

    #is it a new page directory?
    if pageDir != svPageDir[0]:
        # check if directory exists
        if not os.path.isdir(pageDir):
            # add it to list of missing page directories
            self.noFacList.append(pageDir)
            # and also write it out to facsimile dud directory list for future reference
            self.flNoFacDirs.write(pageDir + '\n')
            # reset the slots themselves (plain assignment to the parameter
            # names would only rebind locals)
            svPageDir[0] = ''
            svPad[0] = 0
            svSuffix[0] = ''
            return ''
        # get new padding and suffix for the new page directory
        pad, suffix = self.get_pad(pageDir)
        # Set static variables for next time
        svPageDir[0] = pageDir
        svPad[0] = pad
        svSuffix[0] = suffix
    else:
        # directory hasn't changed, so we assume the old padding and suffix data still applies
        pad = svPad[0]
        suffix = svSuffix[0]

    # get page file name without suffix in file directory from self.page, 0 padded if appropriate
    page = self.get_page_entry(pad)
    # no page, no facurl
    if page == '':
        return ''

    facurl = pageDir + '/' + page + suffix
    if os.path.isfile(facurl):
        return facurl
    self.no_fac(facurl)
    return ''
# end of get_facurl


#Starts with an uncompressed fronimo string (stFron) and reads values into
# an instance of a fronimo class object.
def populate(self):
    """Fill this instance's fields from self.stFron.

    Returns False only when the 'CPiece' marker cannot be found; footnote
    and info parse failures are reported through Fronimo.print_error and do
    not stop processing.  Increments the module-level facCount when the
    footnote parsed but no facsimile file was found.
    """
    global facCount
    #Assume we can parse footnote unless proven otherwise.
    fFootnote = True

    self.offset = self.start_offset
    self._getBstr()  # Skip page number string
    self.footnote = self._getBstr().strip()
    if not self.parse_footnote():
        Fronimo.print_error(self.stFile, "Cannot parse footnote:\n%s" % self.footnote)
        fFootnote = False

    newPos = self.stFron.find('CPiece')
    # This is very unlikely to happen
    if newPos == -1:
        Fronimo.print_error(self.stFile, "Cannot find pattern 'CPiece'")
        return False
    # presumably 14 skips the marker and its header bytes -- TODO confirm
    self.offset = newPos + 14

    self.title = self.rtf2latin(self._getBstr().strip(' \t'))

    subtitle = self.rtf2latin(self._getBstr().strip('[\t ]'))
    self.subtitle = subtitle.replace('\n', '|')

    # must be called before the composer string is read
    self.composer0 = self.get_composer0()
    self.composer = self.rtf2latin(self._getBstr().strip('[\t ]'))

    if self.source == "":
        self.source = self.composer
    if self.source == 'Anonymous':
        self.source = 'Unknown'

    self._getBstr()  # Discard text at beginning and end of section
    self._getBstr()
    self.info = self._getBstr().strip().replace('\r', '')

    # Skip key field; will get key from info field
    # offset = stFron.find('CBAR',offset) + 10
    # keyNum = self._getWord(True)
    self.composer = self.composer.replace('Anon.', 'Anonymous')
    self.composer0 = self.composer0.replace('Anon.', 'Anonymous')

    if not self.info:
        Fronimo.print_error(self.stFile, "No info data.")
    elif not self.parse_info():
        Fronimo.print_error(self.stFile, "Cannot parse info string.")

    self.mtime = int(os.path.getmtime(self.stFile))
    self.ctime = int(os.path.getctime(self.stFile))
    self.stMidi = self.find_file("midi").replace(LOCALBASE, self.base, 1)
    self.stPdf = self.find_file("pdf").replace(LOCALBASE, self.base, 1)

    #test to see if facsimile file exists
    #only if footnote parse was OK.
    if fFootnote:
        self.facurl = self.get_facurl()
        if not self.facurl:
            facCount += 1

    for name in [self.composer, self.composer0]:
        if name:
            self.check_name(name)
    self.write_new_types()
    return True
# End Populate


def printout(self, lsPrint):
    """Write one record, given as (name, value) pairs, to the output file.

    With self.fJson set, the record goes to self.flJson as a '{ ... },'
    group with one '"name" : "value",' line per pair (double quotes in
    string values are backslash-escaped).  Otherwise it goes to self.flTsv
    as a single tab-separated line, with '|' removed from the composer,
    composer0 and subtitle values.
    """
    if self.fJson:
        out = self.flJson
        print('{', file=out)
        for key, value in lsPrint:
            # Escape the quotes
            if isinstance(value, str):
                value = value.replace('"', '\\"')
            print('"%s" : "%s",' % (key, value), file=out)
        print('},', file=out)
    else:
        out = self.flTsv
        cells = []
        for key, value in lsPrint:
            if key in ('composer', 'composer0', 'subtitle'):
                value = value.replace('|', '')
            cells.append(str(value))
        print('\t'.join(cells), file=out)
    out.flush()
# end of printout


def write_data(self):
    """Collect the attributes named in key_order and pass them to printout.

    String values have newlines turned into '|' and the \\x92 / \\x96
    characters turned into "'" and "-".
    """
    # Write values for all named columns
    vals = []
    for att in key_order:
        val = getattr(self, att)
        if isinstance(val, str):
            val = val.replace('\n', '|')
            val = val.replace("\x92", "'")
            val = val.replace("\x96", "-")
        # print out footnote and info values only to json file.
        # Decided by attribute name: comparing values dropped any other
        # column that happened to equal them (e.g. all empty ones).
        # NOTE(review): confirm print_headers omits these two columns for tsv.
        if att in ('footnote', 'info') and not self.fJson:
            continue
        vals.append((att, val))
    self.printout(vals)
#end write_data


def find_file(self, type):
    """Return the path of the companion file of the given type, or ''.

    The companion is looked for in the sub-directory named by `type` next
    to self.stFile, under the same base name with '.ft3' replaced by '.mid'
    ("midi"), '.pdf' ("pdf") or '.tab' (anything else).  The returned path
    uses forward slashes.
    """
    fronDir = os.path.dirname(self.stFile).replace('\\', '/')
    fronType = os.path.basename(self.stFile).replace('\\', '/')

    if type == "midi":
        ext = '.mid'
    elif type == 'pdf':
        ext = '.pdf'
    else:
        ext = '.tab'
    fronType = fronType.replace('.ft3', ext)

    # same joining as join_with_fslash(join_with_fslash(dir, type), name)
    stFl = '/'.join((fronDir, type, fronType))
    if not os.path.exists(stFl):
        return ""
    return stFl
# End find_file


# Takes data from a fronimo class instance
# and inserts it into existing stFron
# NOTE(review): depopulate and write_file take `self` and belong to class
# Fronimo.  SOURCE carries no indentation, so they are laid out flat here;
# nest them one level deeper when splicing into the class body.
def depopulate(self):
    """Write this instance's text fields back into self.stFron.

    Puts footnote, title, subtitle, composer and info, in that order,
    through self._put; '|' separators in subtitle and info become '\\r\\n'.
    self.subtitle is left holding the converted subtitle text.  Returns
    False when the 'CPiece' marker cannot be found, True otherwise.
    """
    self.offset = self.start_offset
    # throw away page number string and update offset
    self._getBstr()
    self._put(self.footnote)

    newPos = self.stFron.find('CPiece')
    # This is very unlikely to happen
    if newPos == -1:
        Fronimo.print_error(self.stFile, "Cannot find pattern 'CPiece'")
        return False
    # presumably 14 skips the marker and its header bytes -- TODO confirm
    self.offset = newPos + 14

    self._put(self.latin2rtf(self.title))

    # plain '|': the former '\|' literal is backslash + pipe and never
    # matched the separators stored in the subtitle
    self.subtitle = self.subtitle.replace('|', '\r\n')
    self.subtitle = self.latin2rtf(self.subtitle)
    self._put(self.subtitle)

    self._put(self.latin2rtf(self.composer))

    # Discard text at beginning and end of section
    self._getBstr()
    self._getBstr()

    self._put(self.info.replace('|', '\r\n'))
    return True
# end depopulate


# If from a template, prompts where to write it to.
# If from an existing file, writes to that file
# Maybe with a backup file, until all is known to be kosher
def write_file(self, fPrompt):
    """Gzip self.stFron (encoded as latin1) to a .ft3 file.

    With fPrompt false the data goes straight to self.stFile.  With fPrompt
    true the user is asked for a name; self.stFile is offered as the default
    unless it is the template, and '.ft3' is appended when missing.

    Returns True when the whole buffer was written, False otherwise.
    No backup file is made.
    """
    btFron = bytes(self.stFron, 'latin1')

    if not fPrompt:
        stInput = self.stFile
    else:
        stFlOut = "" if self.stFile == TEMPLATE else self.stFile
        if stFlOut:
            stInput = input("Write file to (%s): " % stFlOut)
            if stInput == "":
                stInput = stFlOut
        else:
            stInput = ""
            while stInput == "":
                stInput = input("Write file to: ")
        if stInput[-4:] != ".ft3":
            stInput += '.ft3'

    # the context manager closes the stream, which is what completes the
    # gzip output; it was previously left open
    with gzip.open(stInput, "wb") as flOut:
        written = flOut.write(btFron)
    return written == len(btFron)
# end of write_file

# end of class Fronimo


def usage():
    """Print the usage line for the current program and exit with status 0."""
    if Fronimo.currProg == 'dft.py':
        print("USAGE: dft.py [-j] [-o <outfile>] [<dir> [<dir> ...]].")
    elif Fronimo.currProg == 'cft.py':
        print("USAGE: cft [<json file> ...]")
    else:
        print("USAGE: eft: <json file> [<fronimo file> ...]")
    sys.exit(0)


def do_eft():
    """Apply the changes in a json file to one or more fronimo files.

    sys.argv (program name already removed) must start with a json file;
    any further arguments must be .ft3 files.  With no .ft3 argument the
    template file is used.  Each file is updated and written out after
    prompting for the output name.
    """
    if not sys.argv:
        usage()

    stJson = sys.argv[0]
    if not stJson.endswith('.json'):
        print("First argument must be a json data file.")
        usage()
    with open(stJson, encoding = "latin1") as jFile:
        dChanges = json.load(jFile)
    if dChanges == {}:
        # report the json file name (the old message used an unbound name)
        print("Cannot load data from json file %s." % stJson)
        sys.exit(1)

    sys.argv.pop(0)
    if not sys.argv:
        sys.argv += [TEMPLATE]

    # Same error file for all instances
    if not Fronimo.open_error(EERRFILE):
        sys.exit(2)
    # Open name- and type-related files
    Fronimo.open_files()

    # Update fronimo files, using data from json file
    for f in sys.argv:
        if not f.endswith('.ft3'):
            print("Subsequent arguments, if any, must be fronimo files.")
            usage()
        fron = Fronimo(f)
        # means file not found
        if fron.start_offset <= 0:
            continue
        fron.load_changes(dChanges)
        if not fron.depopulate():
            Fronimo.print_error(fron.stFile, "Cannot update fronimo file.")
            continue
        if not fron.write_file(True):
            Fronimo.print_error(fron.stFile, "Cannot write fronimo file.")
            continue
# end do_eft


# Walk the main directory tree
def walk_tree(sDir):
    """Dump every .ft3 file found under sDir; return how many were written.

    Directories whose path contains '/' + an entry of BADDIRS are skipped
    together with everything below them.  Headers are printed first, and
    the output file is flushed (after a closing ']' in json mode) at the end.
    """
    count = 0
    sDir = sDir.replace("\\", '/')
    Fronimo.print_headers()

    for root, dirs, files in os.walk(sDir):
        root = root.replace("\\", '/')

        # Won't find any .ft3 files in directory trees starting with BADDIRS
        # so don't process these.  Every path below such a directory matches
        # too, so descending into it is pointless.
        if any(("/" + target) in root for target in BADDIRS):
            dirs[:] = []
            continue

        # Process each .ft3 file in this directory.
        for fname in files:
            if not fname.endswith('.ft3'):
                continue
            # collects info from fronimo file, formats it, and outputs it.
            fron = Fronimo(root + '/' + fname)
            # means file not processed correctly
            if fron.start_offset <= 0:
                continue
            fron.write_data()
            count += 1

    # After walking the directories.
    if Fronimo.fJson:
        print(']', end= "", file=Fronimo.flJson)
        Fronimo.flJson.flush()
    else:
        Fronimo.flTsv.flush()
    return count
# end walk_tree


def get_args(lsDir):
    """Parse dft's arguments and open the output file.

    lsDir is the argument list with the program name removed.  A leading
    '-j' selects json output; a following '-o <file>' names the output
    file (the matching extension is appended when missing).  Both are
    popped from lsDir in place.

    Returns the list of start directories: the remaining arguments, or the
    current directory when there are none.  Calls usage() on a missing
    output name, an output name that is a directory, or a start argument
    that is not a directory.
    """
    Fronimo.fJson = False
    fNewOut = False
    outFile = TSVFILE # outFile default values

    if lsDir and lsDir[0] == '-j':
        # We are creating a json file
        Fronimo.fJson = True
        outFile = JSONFILE
        lsDir.pop(0)

    # or output file specified on command line
    if lsDir and lsDir[0] == '-o':
        lsDir.pop(0)
        if not lsDir:
            print("Missing output file.")
            usage()
        outFile = lsDir.pop(0)
        fNewOut = True

    if fNewOut:
        stExt = '.json' if Fronimo.fJson else '.tsv'
        if not outFile.endswith(stExt):
            outFile += stExt
    if os.path.isdir(outFile):
        print("Output file %s is a directory." % outFile)
        usage()

    if Fronimo.fJson:
        # for creating a json file
        Fronimo.flJson = Fronimo.open_file(outFile, "w")
        Fronimo.base = LOCALBASE
    else:
        # for creating a spreadsheet
        Fronimo.flTsv = Fronimo.open_file(outFile, "w")
        Fronimo.base = HTTPBASE

    if not lsDir:
        # current directory is default starting place
        return [os.getcwd()]
    # or starting director[ies] specified on the command line.
    for d in lsDir:
        if not os.path.isdir(d):
            print(d, " is not a directory.", file=sys.stderr)
            usage()
    return lsDir
# End get_args


def print_stats(sTime, count):
    """Print elapsed time since sTime and the module counters, then wait for Enter."""
    elapsedTime = int(time.time()) - sTime
    elapsedHours, remainder = divmod(elapsedTime, 3600)
    elapsedMinutes, elapsedSeconds = divmod(remainder, 60)
    print(count, "fronimo files processed in ", elapsedHours, " hours, ",
          elapsedMinutes, "minutes, and ", elapsedSeconds, "secs.")
    print("",
          nameCount, "new/incorrect names\n",
          typeCount, "new/incorrect types\n",
          instCount, "new/incorrect instruments\n",
          partCount, "parts not in ensemble\n",
          facCount, "unmatched facsimile file references")
    print("Type a key to exit: ", end='')
    input()
# end print_stats


def do_dft():
    """Dump the data of every fronimo file under the requested directories."""
    Fronimo.currProg = 'dft.py'
    lsDirs = get_args(sys.argv)
    startTime = int(time.time())
    if not Fronimo.open_error(DERRFILE):
        sys.exit(3)
    # Open name- and type-related files
    Fronimo.open_files()
    # Get start directory from command line
    count = sum(walk_tree(d) for d in lsDirs)
    print_stats(startTime, count)
# end do_dft


def do_cft():
    """Correct fronimo files from the records of one or more json files.

    With no arguments the json file name is asked for interactively
    (default dft.json on the first prompt, empty answer exits afterwards).
    Otherwise every argument must be an existing .json file.
    """
    if not Fronimo.open_error(CERRFILE):
        print("Cannot open error file.")
        sys.exit(4)

    count = 0
    if not sys.argv:
        stInput = input("Json file: ")
        if stInput == "":
            stInput = "dft.json" # default
        # keep asking until the name is a json file that exists
        while True:
            if stInput[-5:] != '.json':
                print("Not a json file.")
            elif not os.path.exists(stInput):
                print("File %s does not exist" % stInput)
            else:
                break
            stInput = input("Json file (dft.json): ")
            if stInput == "":
                sys.exit(0)
        count = Fronimo.correct_all(stInput)
    else:
        # Check to make sure all exist and are json files
        for f in sys.argv:
            if f[-5:] != '.json':
                print("%s is not a json file." % f)
                usage()
        for f in sys.argv:
            if not os.path.exists(f):
                print("File %s does not exist" % f)
                usage()
            count += Fronimo.correct_all(f)
    print_stats(startTime, count)
#end do_cft


# Prompt for comma separated list of types to search for
# Return all types and sub-types
def do_tft():
    """Repeatedly ask for a list of types and show what it expands to.

    An empty answer exits the program.
    """
    Fronimo.open_files()
    while True:
        typeList = input("Types requested:")
        if typeList == "":
            sys.exit(0)
        tpOut = Fronimo.get_type_list(typeList)
        if tpOut:
            show_tuple(tpOut)
# end do_tft


def main():
    """Dispatch to tft, dft, cft or eft according to the program name."""
    firstArg = sys.argv.pop(0)

    # Substring tests because we might be launching from another directory
    if 'tft' in firstArg:
        do_tft()

    if 'dft' in firstArg:
        Fronimo.currProg = 'dft.py'
    elif 'cft' in firstArg:
        Fronimo.currProg = 'cft.py'
    else:
        Fronimo.currProg = 'eft.py'

    if Fronimo.currProg == 'dft.py':
        do_dft()
    elif not sys.argv:
        # eft.py requires at least one argument (a json file)
        if Fronimo.currProg == 'eft.py':
            # (a leftover pdb.set_trace() was removed from this path)
            # presumably a diagnostic listing of self-parented types
            for item in Fronimo.typeList:
                if len(item) == 2 and item[0] == item[1]:
                    print(item[0])
            usage()
        else:
            # cft.py doesn't
            do_cft()
    # cft.py and eft.py don't take these options
    elif sys.argv[0] == '-j' or sys.argv[0] == '-o':
        usage()
    elif Fronimo.currProg == 'eft.py':
        do_eft()
    else:
        do_cft()
# end main


if __name__ == "__main__":
    main()