##// END OF EJS Templates
Add BASE_URL in settings to work with proxies
Add BASE_URL in settings to work with proxies

File last commit:

r0:b84e1135c2c4
r18:5a8055e18e7b
Show More
convertToMadrigal3.py
296 lines | 12.2 KiB | text/x-python | PythonLexer
/ source / madpy / scripts / bin / convertToMadrigal3.py
#!PYTHONEXE
"""convertToMadrigal3.py [--numCPU=<numCPU>][-q --quiet][--skipCache][--skipMad3Download] converts a CEDAR 2.x database to Madrigal 3.0.
Steps:
1. Runs createCachedHdf5Files.py with --overwrite --includeNonDefault to make sure every Hdf5 file in overview is up-to-date
and ready to be switched to be the main file. If --numCPU given, that argument passed to createCachedHdf5Files.py
2. For each experiment, and then for each non-Hdf5 file in that experiment:
3. Remove that file, and copy it to <expDir>/deprecated
4. Move the corresponding Hdf5 file from overview into experiment, adding it with the same status and category as
removed file. If that file does not exist (because createCachedHdf5Files.py failed to create it), log that to stdout.
3. Updates siteTab.txt table to add Madrigal version field to end (3.0)
$Id: convertToMadrigal3.py 7119 2020-06-22 20:28:21Z brideout $
"""
# standard python imports
import os, os.path, sys
import subprocess
import shutil
import glob
import re
import getopt
import time
import traceback
import multiprocessing
import random
# madrigal imports
import madrigal.metadata
import madrigal.admin
import madrigal.data
def convertRecordPlots(plotDir):
    """convertRecordPlots converts all record plots to png with correct numbering

    Record plots are the files in plotDir whose basename contains a run of five
    digits.  The extensions png, jpg, jpeg, gif, eps and ps are checked in that
    order, and only the first extension that has matching files is processed.
    Those files are sorted, and the i-th file is converted to a png whose first
    five-digit run is replaced by the zero-padded index i (00000, 00001, ...).
    png/jpg/jpeg/gif files are converted with the external "convert" command,
    eps/ps files with "gs".

    Inputs:
        plotDir - directory containing the record plot files

    Returns: None.  Nothing is done if the first (sorted) png record plot path
    already contains '00000', since that means the directory was converted before.

    A failed conversion is printed along with its traceback, and the source file
    is then left in place; a source file is only removed after it was converted.
    """
    # argument lists (rather than a split command string) keep paths with whitespace intact
    gsArgs = ['gs', '-dNOPAUSE', '-dBATCH', '-q', '-sDEVICE=png256', '-r120', '-g1230x1230']
    reStr = '[0-9][0-9][0-9][0-9][0-9]'
    globTemplate = '*[0-9][0-9][0-9][0-9][0-9]*.%s'

    pngFiles = sorted(glob.glob(os.path.join(plotDir, globTemplate % ('png'))))
    if pngFiles and pngFiles[0].find('00000') != -1:
        # already converted
        return

    # possible file types in the order they are tried, with the tool that converts them
    imageExts = (('png', 'convert'),
                 ('jpg', 'convert'),
                 ('jpeg', 'convert'),
                 ('gif', 'convert'),
                 ('eps', 'gs'),
                 ('ps', 'gs'))

    for image_ext, image_ext_type in imageExts:
        plotFiles = sorted(glob.glob(os.path.join(plotDir, globTemplate % (image_ext))))
        if not plotFiles:
            continue
        print(('converting *.%s files in %s' % (image_ext, plotDir)))
        for i, plotFile in enumerate(plotFiles):
            basename = os.path.basename(plotFile)
            # only the first five-digit run is the record number; keep the rest of the name
            prefix, suffix = re.split(reStr, basename, maxsplit=1)
            targetBasename = '%s%05i%s' % (prefix, i, suffix)
            targetBasename = targetBasename[:targetBasename.rfind('.') + 1] + 'png'
            target = os.path.join(plotDir, targetBasename)
            if target == plotFile:
                # already a png with the right number - converting and then removing
                # the source would delete the only copy
                continue
            if image_ext_type == 'gs':
                cmd = gsArgs + ['-sOutputFile=%s' % (target), plotFile]
            else:
                cmd = ['convert', plotFile, target]
            try:
                subprocess.check_call(cmd)
            except (OSError, subprocess.CalledProcessError):
                print(('cmd <%s> failed' % (' '.join(cmd))))
                traceback.print_exc()
                # keep the original plot since no png was created from it
                continue
            # remove old file
            os.remove(plotFile)
        # only the first file type found is converted
        break
def processExperiment(args):
    """ method called by multiprocessing pool to handle a single experiment

    Inputs:
        args - a tuple of (expDir, problemExpQueue, quiet):
            expDir - full path to the experiment directory (holds fileTab.txt)
            problemExpQueue - queue on which expDir is put each time one of its
                files cannot be converted
            quiet - if True, progress messages are not printed (problems still are)

    For each non-Hdf5 file registered in fileTab.txt, the file is copied to
    <expDir>/deprecated, the matching Hdf5 file from <expDir>/overview is
    registered with the same status and category, and any record plots are
    converted and moved to the plot directory of the Hdf5 file.  The non-Hdf5
    files are only unregistered after all Hdf5 files were added.

    Returns: None.  Any exception is printed with its traceback rather than
    raised, so one bad experiment does not abort the whole pool.
    """
    # bound before the try block so the outer handler can always report them
    expDir = None
    madFile = None

    # put the entire method in a try block to get full traceback
    try:
        t = time.time()
        expDir, problemExpQueue, quiet = args
        madDB = madrigal.metadata.MadrigalDB()
        # created per call instead of read from a module-level global, which only
        # exists in a worker process when the pool was started by fork
        madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)
        madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        deprecatedDir = os.path.join(expDir, 'deprecated')
        if not quiet:
            print(('working on dir %s' % (expDir)))
        try:
            os.mkdir(deprecatedDir)
        except OSError:
            # normally means it already exists from an earlier (partial) conversion;
            # if it is really missing the copy below fails and the exp is reported
            pass

        # the first pass is to get a list of all files already registered, in case of a partial conversion
        # also makes sure file times are all registered
        filesInExp = []
        modified = False
        for j in range(madFile.getFileCount()):
            filesInExp.append(madFile.getFilenameByPosition(j))
            if madFile.getFileDatetimeByPosition(j) is None:
                # NOTE(review): presumably passing None lets the metadata layer fill in
                # a default file time - confirm against madrigal.metadata
                madFile.setFileDatetimeByPosition(j, None)
                modified = True # so we know we need to rewrite it

        if modified:
            madFile.writeMetadata()

        if not quiet:
            for s in filesInExp:
                print(s)

        filesToRemove = [] # remove files only after adding files
        for j in range(madFile.getFileCount()):
            filename = madFile.getFilenameByPosition(j)
            fileExtension = os.path.splitext(filename)[1]
            overviewFile = os.path.join(expDir, 'overview', filename + '.summary')
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not os.access(overviewFile, os.R_OK):
                    if not quiet:
                        print(('adding overview to %s' % (filename)))
                    # object is created only for its side effect on the overview
                    madrigal.data.MadrigalFile(os.path.join(expDir, filename), madDB, acceptOldSummary=True)
                if not quiet:
                    print(('skipping %s because already Hdf5' % (filename)))
                continue

            if not quiet:
                print(('working on file %s' % (filename)))

            if filename + '.hdf5' in filesInExp:
                if not quiet:
                    print(('simply deleting %s because already exists as hdf5' % (filename)))
                filesToRemove.append(filename)
                continue

            status = madFile.getStatusByPosition(j)
            category = madFile.getCategoryByPosition(j)
            mad2File = os.path.join(expDir, filename)
            try:
                # explicit destination file, so a missing deprecated directory is an
                # error instead of silently creating a plain file named "deprecated"
                shutil.copy(mad2File, os.path.join(deprecatedDir, os.path.basename(filename)))
            except OSError:
                print(('Mad2 file %s not found! Adding this exp to problem list' % (mad2File)))
                problemExpQueue.put(expDir)
                continue

            hdf5File = os.path.join(expDir, 'overview', filename + '.hdf5')
            if not os.access(hdf5File, os.R_OK):
                print(('Hdf5 file %s not found! Adding this exp to problem list' % (hdf5File)))
                problemExpQueue.put(expDir)
                continue

            try:
                # load Hdf5 first before deleting original file
                madDataObj = madrigal.data.MadrigalFile(hdf5File, madDB, saveSummary=False)
                kindats = madDataObj.getKindatList()
                madAdmin.addMadrigalFile(expDir, hdf5File, 0, status, category, kindat=kindats[0],
                                         acceptOldSummary=True, notify=False)
                filesToRemove.append(filename)
                # now make sure summary file updated
                newMadFile = os.path.join(expDir, os.path.basename(hdf5File))
                madrigal.data.MadrigalFile(newMadFile, madDB)
                if not quiet:
                    print(('%s added' % (hdf5File)))
            except Exception:
                print(('problem with file %s' % (hdf5File)))
                traceback.print_exc()
                problemExpQueue.put(expDir)
                continue

            # check if record plots exists
            oldFilenameDir = os.path.join(expDir, 'plots', filename)
            newFilenameDir = os.path.join(expDir, 'plots', filename + '.hdf5')
            recordsDir = os.path.join(oldFilenameDir, 'records')
            newRecordsDir = os.path.join(newFilenameDir, 'records')
            # compare the number of plots (not the lists of names themselves)
            orgCount = len(glob.glob(os.path.join(recordsDir, '*.*')))
            newCount = len(glob.glob(os.path.join(newRecordsDir, '*.*')))
            if os.access(recordsDir, os.R_OK) and orgCount > newCount:
                if not quiet:
                    print(('converting record plots in %s' % (recordsDir)))
                convertRecordPlots(recordsDir)
                # just to be sure its doesn't exist
                if os.path.isdir(newFilenameDir) and not os.path.islink(newFilenameDir):
                    shutil.rmtree(newFilenameDir)
                elif os.path.lexists(newFilenameDir):
                    os.remove(newFilenameDir)
                shutil.move(oldFilenameDir, newFilenameDir)

        for filename in filesToRemove:
            try:
                madAdmin.removeMadrigalFile(expDir, filename, allowMissing=True)
                if not quiet:
                    print(('removed %s' % (filename)))
            except Exception:
                traceback.print_exc()
                target = os.path.join(expDir, filename)
                if not quiet:
                    print(('Will remove the file %s manually if possible' % (target)))
                try:
                    os.remove(target)
                except OSError:
                    # best effort only - the file may already be gone
                    pass

        if not quiet:
            print(('Processing expDir %s took %f seconds' % (expDir, time.time()-t)))

    except Exception:
        print(('Exception raised with expDir %s and file %s' % (str(expDir), str(madFile))))
        traceback.print_exc()
### main script begins here ###
if __name__ == '__main__':

    t1 = time.time()

    # default number of worker processes: leave two cpus free, but always use 1 or 2
    numCPU = max(1, min(2, multiprocessing.cpu_count() - 2))
    numCPUCmd = ''  # --numCPU argument passed on to createCachedHdf5Files.py, if any
    quiet = False
    skipCache = False
    skipMad3Download = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "q", ["numCPU=", "quiet", "skipCache", "skipMad3Download"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == '--numCPU':
            # the user can lower the number of processes, but not exceed the default
            numCPU = min(int(a), numCPU)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
            numCPUCmd = '--numCPU=%i' % (numCPU)
        elif o in ('-q', '--quiet'):
            quiet = True
        elif o == '--skipCache':
            skipCache = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        else:
            # getopt only returns the options listed above; raise rather than
            # assert so the check is not stripped when run with -O
            raise ValueError('unhandled option %s' % (o))

    madDB = madrigal.metadata.MadrigalDB()
    # kept as a module-level name so code that looks it up as a global still finds it
    madAdmin = madrigal.admin.MadrigalDBAdmin(madDB)

    # step 1 - bring the cached Hdf5 files in overview up to date
    # built as an argument list so a madroot containing whitespace is handled
    cmd = ['%s/bin/createCachedHdf5Files.py' % (madDB.getMadroot()), '--includeNonDefault', '--mad3']
    if skipMad3Download:
        cmd.append('--skipMad3Download')
    cmd.append('--includeGeo')
    if numCPUCmd:
        cmd.append(numCPUCmd)
    if not skipCache:
        subprocess.check_call(cmd)

    m = multiprocessing.Manager()
    problemExpQueue = m.Queue()

    expsToProcess = [] # argument expDir, to processExp
    madExp = madrigal.metadata.MadrigalExperiment(madDB)
    for i in range(madExp.getExpCount()):
        expDir = madExp.getExpDirByPosition(i)
        try:
            # only experiments whose fileTab.txt can be loaded are processed
            madFile = madrigal.metadata.MadrigalMetaFile(madDB, os.path.join(expDir, 'fileTab.txt'))
        except Exception:
            print(('no files in experiments %s' % (expDir)))
            continue
        expsToProcess.append((expDir, problemExpQueue, quiet))

    random.shuffle(expsToProcess)

    # steps 2 to 4 - convert each experiment, handing them out in chunks of 5
    pool = multiprocessing.Pool(processes=numCPU)
    try:
        pool.map(processExperiment, expsToProcess, 5)
    finally:
        # always release the worker processes, even if map raised
        pool.close()
        pool.join()

    print('pooled calls to processExperiment done - calling updateMaster')
    madAdmin.updateMaster()

    print('Set site version to 3.0 if needed')
    mdSiteObj = madrigal.metadata.MadrigalSite(madDB)
    siteID = madDB.getSiteID()
    # The version is always written: the former guard compared the site ID (not the
    # site version) with '3.0', so it was always true and did not protect anything.
    mdSiteObj.setSiteVersionBySiteID(siteID, '3.0')
    mdSiteObj.writeMetadata()

    numProblems = problemExpQueue.qsize()
    if numProblems == 0:
        print('Conversion without problems')
    else:
        print('The following experiments had missing Hdf5 files')
        # an experiment is queued once per failed file - report each only once
        reported = set()
        for i in range(numProblems):
            problemExp = problemExpQueue.get()
            if problemExp in reported:
                continue
            reported.add(problemExp)
            print((problemExp))

    print(('Total time to run convertToMadrigal %f secs' % (time.time() - t1)))