convertToMadrigal3.py
296 lines
| 12.2 KiB
| text/x-python
|
PythonLexer
r0 | #!PYTHONEXE | |||
"""convertToMadrigal3.py [--numCPU=<numCPU>][-q --quiet][--skipCache][--skipMad3Download] converts a CEDAR 2.x database to Madrigal 3.0. | ||||
Steps: | ||||
1. Unless --skipCache is given, runs createCachedHdf5Files.py with --includeNonDefault --mad3 --includeGeo to make sure every Hdf5 file in overview is up-to-date
and ready to be switched to be the main file. If --numCPU and/or --skipMad3Download are given, they are passed on to createCachedHdf5Files.py
2. For each experiment, and then for each non-Hdf5 file in that experiment: | ||||
3. Remove that file, and copy it to <expDir>/deprecated | ||||
4. Move the corresponding Hdf5 file from overview into experiment, adding it with the same status and category as | ||||
removed file. If that file does not exist (because createCachedHdf5Files.py failed to create it), log that to stdout. | ||||
5. Updates siteTab.txt table to add Madrigal version field to end (3.0)
$Id: convertToMadrigal3.py 7119 2020-06-22 20:28:21Z brideout $ | ||||
""" | ||||
# standard python imports | ||||
import os, os.path, sys | ||||
import subprocess | ||||
import shutil | ||||
import glob | ||||
import re | ||||
import getopt | ||||
import time | ||||
import traceback | ||||
import multiprocessing | ||||
import random | ||||
# madrigal imports | ||||
import madrigal.metadata | ||||
import madrigal.admin | ||||
import madrigal.data | ||||
def convertRecordPlots(plotDir):
    """convertRecordPlots converts all record plots to png with correct numbering

    A record plot is any file in plotDir whose basename contains a five digit
    number.  The first image type (checked in the order png, jpg, jpeg, gif,
    eps, ps) that has at least one such file is processed; other types are
    then ignored.  The files of that type are sorted, and the i-th file is
    converted to a png whose name is the text before the first five digit
    field, the zero-padded zero-based index i, and the text that followed
    that field, with the extension replaced by png.

    Inputs:

        plotDir - directory containing the record plots

    Returns: None.  Nothing is done if the first (sorted) numbered png path
    already contains '00000', which is taken to mean the directory was
    converted earlier.

    Affects: runs the external programs "convert" (png, jpg, jpeg, gif) or
    "gs" (eps, ps) once per file.  The source file is removed after a
    successful conversion.  If the external program cannot be run or exits
    with an error, the failure and traceback are printed and the source
    file is left in place so no plot is lost.
    """
    # argument vectors are built as lists (never split from a string) so
    # that paths containing whitespace are passed through intact
    gsCmdPrefix = ['gs', '-dNOPAUSE', '-dBATCH', '-q', '-sDEVICE=png256',
                   '-r120', '-g1230x1230']
    reStr = '[0-9][0-9][0-9][0-9][0-9]'
    globTemplate = '*' + reStr + '*.%s'

    pngFiles = sorted(glob.glob(os.path.join(plotDir, globTemplate % ('png'))))
    if pngFiles and '00000' in pngFiles[0]:
        # already converted
        return

    # possible file types, in the order they are tried, with the tool used
    image_exts = (('png', 'convert'),
                  ('jpg', 'convert'),
                  ('jpeg', 'convert'),
                  ('gif', 'convert'),
                  ('eps', 'gs'),
                  ('ps', 'gs'))

    for image_ext, image_ext_type in image_exts:
        plotFiles = sorted(glob.glob(os.path.join(plotDir, globTemplate % (image_ext))))
        if not plotFiles:
            continue

        print(('converting *.%s files in %s' % (image_ext, plotDir)))
        for i, plotFile in enumerate(plotFiles):
            basename = os.path.basename(plotFile)
            items = re.split(reStr, basename)
            targetBasename = items[0] + '%05i' % (i) + items[1]
            targetBasename = targetBasename[:targetBasename.rfind('.')+1] + 'png'
            target = os.path.join(plotDir, targetBasename)

            if target == plotFile:
                # already has its final name - converting and then removing
                # the "old" file would delete the plot itself
                continue

            if image_ext_type == 'gs':
                cmd = gsCmdPrefix + ['-sOutputFile=%s' % (target), plotFile]
            else:
                cmd = ['convert', plotFile, target]

            try:
                subprocess.check_call(cmd)
            except (subprocess.CalledProcessError, OSError):
                print(('cmd <%s> failed' % (' '.join(cmd))))
                traceback.print_exc()
                # keep the source file since no png was produced from it
                continue

            # remove old file
            os.remove(plotFile)

        # only the first image type found is converted
        break
def _migrateRecordPlots(expDir, filename, quiet):
    """_migrateRecordPlots converts the record plots of one file and moves its plot directory

    Inputs:

        expDir - experiment directory
        filename - basename of the original (non-Hdf5) file
        quiet - if True, suppress progress messages

    Nothing is done unless <expDir>/plots/<filename>/records is readable and
    holds more "*.*" entries than <expDir>/plots/<filename>.hdf5/records.
    Otherwise the record plots are converted to png via convertRecordPlots,
    anything already at <expDir>/plots/<filename>.hdf5 is deleted, and
    <expDir>/plots/<filename> is moved to that name.
    """
    oldFilenameDir = os.path.join(expDir, 'plots', filename)
    newFilenameDir = os.path.join(expDir, 'plots', filename + '.hdf5')
    recordsDir = os.path.join(oldFilenameDir, 'records')
    newRecordsDir = os.path.join(newFilenameDir, 'records')

    # compare the number of entries, not the lists of paths themselves
    orgCount = len(glob.glob(os.path.join(recordsDir, '*.*')))
    newCount = len(glob.glob(os.path.join(newRecordsDir, '*.*')))
    if not os.access(recordsDir, os.R_OK) or orgCount <= newCount:
        return

    if not quiet:
        print(('converting record plots in %s' % (recordsDir)))
    convertRecordPlots(recordsDir)

    # just to be sure the destination doesn't exist
    if os.path.islink(newFilenameDir) or os.path.isfile(newFilenameDir):
        os.remove(newFilenameDir)
    elif os.path.isdir(newFilenameDir):
        shutil.rmtree(newFilenameDir)
    shutil.move(oldFilenameDir, newFilenameDir)


def processExperiment(args):
    """processExperiment is the method called by the multiprocessing pool to handle a single experiment

    Inputs:

        args - a tuple of (expDir, problemExpQueue, quiet) where
            expDir - experiment directory containing fileTab.txt
            problemExpQueue - queue; expDir is put on it for every file that
                could not be converted
            quiet - if True, suppress progress messages

    Returns: None

    For each file registered in fileTab.txt:
        - files with extension .h5, .hdf5 or .hdf are skipped, after creating
          their overview summary if it is not readable
        - if <filename>.hdf5 is already registered, the file is only
          scheduled for removal
        - otherwise the file is copied to <expDir>/deprecated, the Hdf5 file
          <expDir>/overview/<filename>.hdf5 is added to the experiment with
          the same status and category, the file is scheduled for removal,
          and its record plots are migrated
    Scheduled files are removed only after all additions are done.

    The whole method runs inside a try block; any unexpected exception is
    printed with its full traceback and is not re-raised.
    """
    # bound before the try block so the handler below can always report them
    expDir = None
    madFile = None

    # put the entire method in a try block to get full traceback
    try:
        t = time.time()
        expDir, problemExpQueue, quiet = args
        madDB = madrigal.metadata.MadrigalDB()
        # created here rather than read from a module global, so this also
        # works in worker processes that did not inherit the parent's globals
        madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)
        madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        deprecatedDir = os.path.join(expDir, 'deprecated')
        if not quiet:
            print(('working on dir %s' % (expDir)))
        try:
            os.mkdir(deprecatedDir)
        except OSError:
            # typically the directory already exists; any other failure
            # shows up when the copy into it fails below
            pass

        # the first pass is to get a list of all files already registered, in case of a partial conversion
        # also makes sure file times are all registered
        filesInExp = []
        modified = False
        for j in range(madFile.getFileCount()):
            filesInExp.append(madFile.getFilenameByPosition(j))
            if madFile.getFileDatetimeByPosition(j) is None:
                madFile.setFileDatetimeByPosition(j, None)
                modified = True  # so we know we need to rewrite it
        if modified:
            madFile.writeMetadata()
        if not quiet:
            for s in filesInExp:
                print(s)
        registeredFiles = set(filesInExp)

        filesToRemove = []  # remove files only after adding files
        for j in range(madFile.getFileCount()):
            filename = madFile.getFilenameByPosition(j)
            fileExtension = os.path.splitext(filename)[1]
            overviewFile = os.path.join(expDir, 'overview', filename + '.summary')

            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not os.access(overviewFile, os.R_OK):
                    if not quiet:
                        print(('adding overview to %s' % (filename)))
                    madrigal.data.MadrigalFile(os.path.join(expDir, filename), madDB, acceptOldSummary=True)
                if not quiet:
                    print(('skipping %s because already Hdf5' % (filename)))
                continue

            if not quiet:
                print(('working on file %s' % (filename)))

            if filename + '.hdf5' in registeredFiles:
                if not quiet:
                    print(('simply deleting %s because already exists as hdf5' % (filename)))
                filesToRemove.append(filename)
                continue

            status = madFile.getStatusByPosition(j)
            category = madFile.getCategoryByPosition(j)
            try:
                shutil.copy(os.path.join(expDir, filename), deprecatedDir)
            except (OSError, shutil.Error):
                print(('Mad2 file %s not found! Adding this exp to problem list' % (os.path.join(expDir, filename))))
                problemExpQueue.put(expDir)
                continue

            hdf5File = os.path.join(expDir, 'overview', filename + '.hdf5')
            if not os.access(hdf5File, os.R_OK):
                print(('Hdf5 file %s not found! Adding this exp to problem list' % (hdf5File)))
                problemExpQueue.put(expDir)
                continue

            try:
                # load Hdf5 first before deleting original file
                madDataObj = madrigal.data.MadrigalFile(hdf5File, madDB, saveSummary=False)
                kindats = madDataObj.getKindatList()
                madAdmin.addMadrigalFile(expDir, hdf5File, 0, status, category, kindat=kindats[0],
                                         acceptOldSummary=True, notify=False)
                filesToRemove.append(filename)
                # now make sure summary file updated - the object is built
                # only for that effect, so it is not kept
                newMadFile = os.path.join(expDir, os.path.basename(hdf5File))
                madrigal.data.MadrigalFile(newMadFile, madDB)
                if not quiet:
                    print(('%s added' % (hdf5File)))
            except Exception:
                print(('problem with file %s' % (hdf5File)))
                traceback.print_exc()
                problemExpQueue.put(expDir)
                continue

            # check if record plots exists
            _migrateRecordPlots(expDir, filename, quiet)

        for filename in filesToRemove:
            try:
                madAdmin.removeMadrigalFile(expDir, filename, allowMissing=True)
                if not quiet:
                    print(('removed %s' % (filename)))
            except Exception:
                traceback.print_exc()
                target = os.path.join(expDir, filename)
                if not quiet:
                    print(('Will remove the file %s manually if possible' % (target)))
                try:
                    os.remove(target)
                except OSError:
                    # best effort only - the file may already be gone
                    pass

        if not quiet:
            print(('Processing expDir %s took %f seconds' % (expDir, time.time()-t)))

    except Exception:
        print(('Exception raised with expDir %s and file %s' % (str(expDir), str(madFile))))
        traceback.print_exc()
### main script begins here ### | ||||
if __name__ == '__main__':

    # Command line driver: optionally rebuild the cached Hdf5 files, convert
    # every experiment in a multiprocessing pool, update the master metadata,
    # set the site version, and report experiments that had problems.
    # NOTE: madDB and madAdmin are deliberately left as module level names.

    t1 = time.time()

    numCPUCmd = ''
    # default worker count: leave two cores free, but always use 1 or 2 workers
    numCPU = min(max(multiprocessing.cpu_count() - 2, 1), 2)
    quiet = False
    skipCache = False
    skipMad3Download = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "q", ["numCPU=", "quiet", "skipCache", "skipMad3Download"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == '--numCPU':
            # a requested count can only lower the default, never raise it
            numCPU = min(int(a), numCPU)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
            numCPUCmd = '--numCPU=%i' % (numCPU)
        elif o == '-q' or o == '--quiet':
            quiet = True
        elif o == '--skipCache':
            skipCache = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        else:
            # raise rather than assert so the check survives python -O
            raise ValueError('unhandled option %s' % (o))

    madDB = madrigal.metadata.MadrigalDB()
    madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)

    # build the argument vector directly so a madroot containing whitespace
    # is not broken into several arguments
    cmd = ['%s/bin/createCachedHdf5Files.py' % (madDB.getMadroot()),
           '--includeNonDefault', '--mad3']
    if skipMad3Download:
        cmd.append('--skipMad3Download')
    cmd.append('--includeGeo')
    if numCPUCmd:
        cmd.append(numCPUCmd)

    if not skipCache:
        subprocess.check_call(cmd)

    m = multiprocessing.Manager()
    problemExpQueue = m.Queue()

    expsToProcess = []  # argument tuples (expDir, problemExpQueue, quiet) for processExperiment
    madExp = madrigal.metadata.MadrigalExperiment(madDB)
    for i in range(madExp.getExpCount()):
        expDir = madExp.getExpDirByPosition(i)
        try:
            # only built to check that the experiment has a usable fileTab.txt
            madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        except Exception:
            print(('no files in experiments %s' % (expDir)))
            continue
        expsToProcess.append((expDir, problemExpQueue, quiet))

    random.shuffle(expsToProcess)

    pool = multiprocessing.Pool(processes=numCPU)
    try:
        pool.map(processExperiment, expsToProcess, 5)
    finally:
        # release the worker processes even if map raises
        pool.close()
        pool.join()

    print('pooled calls to processExperiment done - calling updateMaster')
    madAdmin.updateMaster()

    print('Set site version to 3.0 if needed')
    mdSiteObj = madrigal.metadata.MadrigalSite(madDB)
    siteID = madDB.getSiteID()
    # NOTE(review): this compares the site ID, not the site version, with
    # '3.0', so the version looks like it is always rewritten - confirm
    # whether the stored site version was meant to be checked here
    if siteID != '3.0':
        mdSiteObj.setSiteVersionBySiteID(siteID, '3.0')
        mdSiteObj.writeMetadata()

    # an experiment is queued once per failed file - report each one once
    problemExps = []
    for i in range(problemExpQueue.qsize()):
        problemExp = problemExpQueue.get()
        if problemExp not in problemExps:
            problemExps.append(problemExp)

    if not problemExps:
        print('Conversion without problems')
    else:
        print('The following experiments had missing Hdf5 files')
        for problemExp in problemExps:
            print((problemExp))

    print(('Total time to run convertToMadrigal %f secs' % (time.time() - t1)))