#!PYTHONEXE

"""convertToMadrigal3.py [--numCPU=][-q --quiet][--skipCache][--skipMad3Download] converts a CEDAR 2.x database to Madrigal 3.0.

Steps:

    1. Runs createCachedHdf5Files.py with --overwrite --includeNonDefault to make sure every Hdf5 file
       in overview is up-to-date and ready to be switched to be the main file. If --numCPU given,
       that argument passed to createCachedHdf5Files.py

    2. For each experiment, and then for each non-Hdf5 file in that experiment:

    3. Remove that file, and copy it to /deprecated

    4. Move the corresponding Hdf5 file from overview into experiment, adding it with the
       same status and category as removed file. If that file does not exist (because
       createCachedHdf5Files.py failed to create it), log that to stdout.

    5. Updates siteTab.txt table to add Madrigal version field to end (3.0)

$Id: convertToMadrigal3.py 7119 2020-06-22 20:28:21Z brideout $
"""

# standard python imports
import os, os.path, sys
import subprocess
import shutil
import glob
import re
import getopt
import time
import traceback
import multiprocessing
import random

# madrigal imports
import madrigal.metadata
import madrigal.admin
import madrigal.data


def convertRecordPlots(plotDir):
    """convertRecordPlots converts all record plots in plotDir to png with correct numbering.

    Record plots are files whose basename contains a five digit record number.
    The files of the first image type found are converted to png and renumbered
    consecutively from 00000 in sorted order; each source file is then removed.
    Nothing is done if the first png (in sorted order) already contains '00000'.

    Inputs:
        plotDir - directory holding the record plots
    """
    # fixed leading arguments of the ghostscript call; output file and input file are appended
    gsArgs = ['gs', '-dNOPAUSE', '-dBATCH', '-q', '-sDEVICE=png256', '-r120', '-g1230x1230']
    reStr = '[0-9][0-9][0-9][0-9][0-9]'

    pngFiles = sorted(glob.glob(os.path.join(plotDir, '*[0-9][0-9][0-9][0-9][0-9]*.png')))
    if pngFiles and pngFiles[0].find('00000') != -1:
        # already converted
        return

    # loop through possible file types - value is the tool used to convert that type
    image_exts_dict = {'png': 'convert', 'jpg': 'convert', 'jpeg': 'convert',
                       'gif': 'convert', 'eps': 'gs', 'ps': 'gs'}
    for image_ext, image_ext_type in image_exts_dict.items():
        plotFiles = sorted(glob.glob(os.path.join(plotDir,
                                                  '*[0-9][0-9][0-9][0-9][0-9]*.%s' % (image_ext))))
        if not plotFiles:
            continue
        print('converting *.%s files in %s' % (image_ext, plotDir))
        for i, plotFile in enumerate(plotFiles):
            basename = os.path.basename(plotFile)
            items = re.split(reStr, basename)
            targetBasename = items[0] + '%05i' % (i) + items[1]
            targetBasename = targetBasename[:targetBasename.rfind('.') + 1] + 'png'
            target = os.path.join(plotDir, targetBasename)
            # build an argument list rather than splitting a string so that
            # paths containing whitespace are passed through intact
            if image_ext_type == 'gs':
                cmd = gsArgs + ['-sOutputFile=%s' % (target), plotFile]
            else:
                cmd = ['convert', plotFile, target]
            try:
                subprocess.check_call(cmd)
            except (subprocess.CalledProcessError, OSError):
                print('cmd <%s> failed' % (' '.join(cmd)))
                traceback.print_exc()
            # remove old file - unless it is also the conversion target, in which
            # case removing it would delete the result
            if target != plotFile:
                os.remove(plotFile)
        # only the first image type found is converted
        break


def processExperiment(args):
    """method called by multiprocessing pool to handle a single experiment

    Inputs:
        args - a tuple of (expDir, problemExpQueue, quiet) where
            expDir - full path to the experiment directory
            problemExpQueue - queue to which expDir is put for each file that could not be converted
            quiet - if True, only problems are printed

    Returns None. Any exception is caught and printed with a full traceback.
    """
    # defined before the try block so the final except clause can always report them
    expDir = None
    madFile = None
    # put the entire method in a try block to get full traceback
    try:
        t = time.time()
        madDB = madrigal.metadata.MadrigalDB()
        # created here rather than read from a global set under __main__, because
        # that global only exists in worker processes started by fork
        madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)
        expDir, problemExpQueue, quiet = args
        madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        deprecatedDir = os.path.join(expDir, 'deprecated')
        if not quiet:
            print('working on dir %s' % (expDir))
        try:
            os.mkdir(deprecatedDir)
        except OSError:
            # usually means it already exists from a partial conversion; any other
            # failure shows up below when the copy into this directory fails
            pass

        # the first pass is to get a list of all files already registered, in case of a partial conversion
        # also makes sure file times are all registered
        filesInExp = []
        modified = False
        for j in range(madFile.getFileCount()):
            filesInExp.append(madFile.getFilenameByPosition(j))
            if madFile.getFileDatetimeByPosition(j) is None:
                madFile.setFileDatetimeByPosition(j, None)
                modified = True  # so we know we need to rewrite it
        if modified:
            madFile.writeMetadata()
        if not quiet:
            for s in filesInExp:
                print(s)

        filesToRemove = []  # remove files only after adding files
        for j in range(madFile.getFileCount()):
            filename = madFile.getFilenameByPosition(j)
            fileExtension = os.path.splitext(filename)[1]
            overviewFile = os.path.join(expDir, 'overview', filename + '.summary')
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not os.access(overviewFile, os.R_OK):
                    if not quiet:
                        print('adding overview to %s' % (filename))
                    madrigal.data.MadrigalFile(os.path.join(expDir, filename), madDB,
                                               acceptOldSummary=True)
                if not quiet:
                    print('skipping %s because already Hdf5' % (filename))
                continue

            if not quiet:
                print('working on file %s' % (filename))

            if filename + '.hdf5' in filesInExp:
                if not quiet:
                    print('simply deleting %s because already exists as hdf5' % (filename))
                filesToRemove.append(filename)
                continue

            status = madFile.getStatusByPosition(j)
            category = madFile.getCategoryByPosition(j)
            try:
                shutil.copy(os.path.join(expDir, filename), deprecatedDir)
            except OSError:
                print('Mad2 file %s not found! Adding this exp to problem list' % (os.path.join(expDir, filename)))
                problemExpQueue.put(expDir)
                continue
            hdf5File = os.path.join(expDir, 'overview', filename + '.hdf5')
            if not os.access(hdf5File, os.R_OK):
                print('Hdf5 file %s not found! Adding this exp to problem list' % (hdf5File))
                problemExpQueue.put(expDir)
                continue
            try:
                # load Hdf5 first before deleting original file
                madDataObj = madrigal.data.MadrigalFile(hdf5File, madDB, saveSummary=False)
                kindats = madDataObj.getKindatList()
                madAdmin.addMadrigalFile(expDir, hdf5File, 0, status, category,
                                         kindat=kindats[0], acceptOldSummary=True, notify=False)
                filesToRemove.append(filename)
                # now make sure summary file updated
                newMadFile = os.path.join(expDir, os.path.basename(hdf5File))
                madrigal.data.MadrigalFile(newMadFile, madDB)
                if not quiet:
                    print('%s added' % (hdf5File))
            except Exception:
                print('problem with file %s' % (hdf5File))
                traceback.print_exc()
                problemExpQueue.put(expDir)
                continue

            # check if record plots exists - convert only when the old directory
            # holds more files than the new one
            recordsDir = os.path.join(expDir, 'plots', filename, 'records')
            orgCount = len(glob.glob(os.path.join(recordsDir, '*.*')))
            newRecordsDir = os.path.join(expDir, 'plots', filename + '.hdf5', 'records')
            newCount = len(glob.glob(os.path.join(newRecordsDir, '*.*')))
            if os.access(recordsDir, os.R_OK) and orgCount > newCount:
                if not quiet:
                    print('converting record plots in %s' % (recordsDir))
                convertRecordPlots(recordsDir)
                oldFilenameDir = os.path.join(expDir, 'plots', filename)
                newFilenameDir = os.path.join(expDir, 'plots', filename + '.hdf5')
                # just to be sure it doesn't exist
                shutil.rmtree(newFilenameDir, ignore_errors=True)
                shutil.move(oldFilenameDir, newFilenameDir)

        for filename in filesToRemove:
            try:
                madAdmin.removeMadrigalFile(expDir, filename, allowMissing=True)
                if not quiet:
                    print('removed %s' % (filename))
            except Exception:
                traceback.print_exc()
                target = os.path.join(expDir, filename)
                if not quiet:
                    print('Will remove the file %s manually if possible' % (target))
                try:
                    os.remove(target)
                except OSError:
                    # best effort only
                    pass

        if not quiet:
            print('Processing expDir %s took %f seconds' % (expDir, time.time() - t))

    except Exception:
        print('Exception raised with expDir %s and file %s' % (str(expDir), str(madFile)))
        traceback.print_exc()


### main script begins here ###
if __name__ == '__main__':

    t1 = time.time()

    # default is all but two cpus, limited to the range 1 to 2
    numCPUCmd = ''
    numCPU = multiprocessing.cpu_count() - 2
    if numCPU < 1:
        numCPU = 1
    if numCPU > 2:
        numCPU = 2
    quiet = False
    skipCache = False
    skipMad3Download = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "q", ["numCPU=", "quiet", "skipCache", "skipMad3Download"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == '--numCPU':
            # the user may lower, but never raise, the default
            numCPU = min(int(a), numCPU)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
            numCPUCmd = '--numCPU=%i' % (numCPU)
        elif o == '-q' or o == '--quiet':
            quiet = True
        elif o == '--skipCache':
            skipCache = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        else:
            assert False, "unhandled option"

    madDB = madrigal.metadata.MadrigalDB()
    madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)

    # step 1 - bring all cached Hdf5 files up to date; optional arguments are
    # only added to the list when set
    cmd = ['%s/bin/createCachedHdf5Files.py' % (madDB.getMadroot()), '--includeNonDefault', '--mad3']
    if skipMad3Download:
        cmd.append('--skipMad3Download')
    cmd.append('--includeGeo')
    if numCPUCmd:
        cmd.append(numCPUCmd)
    if not skipCache:
        subprocess.check_call(cmd)

    m = multiprocessing.Manager()
    problemExpQueue = m.Queue()

    pool = multiprocessing.Pool(processes=numCPU)

    expsToProcess = []  # argument tuples passed to processExperiment

    madExp = madrigal.metadata.MadrigalExperiment(madDB)
    for i in range(madExp.getExpCount()):
        expDir = madExp.getExpDirByPosition(i)
        try:
            # only used to check that this experiment has a readable fileTab.txt
            madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        except Exception:
            print('no files in experiments %s' % (expDir))
            continue

        expsToProcess.append((expDir, problemExpQueue, quiet))

    random.shuffle(expsToProcess)
    try:
        pool.map(processExperiment, expsToProcess, 5)
    finally:
        # release the worker processes whether or not map succeeded
        pool.close()
        pool.join()

    print('pooled calls to processExperiment done - calling updateMaster')

    madAdmin.updateMaster()

    print('Set site version to 3.0 if needed')
    mdSiteObj = madrigal.metadata.MadrigalSite(madDB)
    siteID = madDB.getSiteID()
    # NOTE(review): this compares the site id, not the site version, with '3.0',
    # so the version is presumably rewritten on every run - confirm intent
    if siteID != '3.0':
        mdSiteObj.setSiteVersionBySiteID(siteID, '3.0')
        mdSiteObj.writeMetadata()

    if problemExpQueue.qsize() == 0:
        print('Conversion without problems')
    else:
        print('The following experiments had missing Hdf5 files')
        for _ in range(problemExpQueue.qsize()):
            print(problemExpQueue.get())

    print('Total time to run convertToMadrigal %f secs' % (time.time() - t1))