##// END OF EJS Templates
Add BASE_URL in settings to work with proxies
Add BASE_URL in settings to work with proxies

File last commit:

r0:b84e1135c2c4
r18:5a8055e18e7b
Show More
createCachedHdf5Files.py
320 lines | 13.0 KiB | text/x-python | PythonLexer
/ source / madpy / scripts / bin / createCachedHdf5Files.py
#!PYTHONEXE
"""createCachedHdf5Files.py is a script that will walk all the files in a Madrigal database, and make sure all
files in the old Cedar 2.X format have cached hdf5 versions. Used only when updating from Madrigal 2.X to
Madrigal 3.
$Id: createCachedHdf5Files.py 7119 2020-06-22 20:28:21Z brideout $
"""
# usage text printed for -h/--help (now documents --quiet, which getopt accepts)
usage = """createCachedHdf5Files.py [--inst=<instList> --path=<expPath> --includeNonDefault --ini=<iniFile> --mad3 --overwrite
--includeGeo --numCPU=<numCPU> -h --help --removeSummary --skipMad3Download --quiet]
By default all instruments will be included. Use --inst=<comma delimited kinst list> to only include some instruments.
By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories.
By default only default files will be cached. Use --includeNonDefault to include all files.
By default, extra parameters and formats are added by the ini file $MADROOT/cachedFiles.ini. Use
--ini=<iniFile> to specify an alternative ini file. See madrigal.data.MadrigalFile._parseCachedIni for description of
the ini file format. Set --includeGeo to also convert geophysical files
Use --overwrite to overwrite all Hdf5 cached files
Use --mad3 to overwrite all non Madrigal3 Hdf5 files
numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be higher than default.
Use --removeSummary to remove summary files before creating Hdf5 files
Use --skipMad3Download to not try to download file from madrigal3.haystack.mit.edu
Use --quiet to suppress per-file skip messages
-h or --help - print usage and exit
"""
import os, os.path, sys
import getopt
import time, datetime
import traceback
import multiprocessing
import warnings
import random
import h5py
import madrigal.metadata
import madrigal.data
import madrigalWeb.madrigalWeb
def downloadMad3File(filename, mad3Url, expDirNum):
    """downloadMad3File downloads the appropriate Madrigal 3 Hdf5 associated with filename if possible
    from mad3Url. Also downloads the summary file.

    Inputs:
        filename - full path to filename on present Madrigal2 server
        mad3Url - url of Madrigal 3 CEDAR Madrigal server to get hdf5 version from
        expDirNum - either '' or '3' - suggestion as to what experiment directory to try first.

    Returns True if success, False if not.
    """
    user_fullname = 'Bill Rideout'
    user_email = 'brideout@haystack.mit.edu'
    user_affiliation = 'MIT'
    format = 'hdf5'
    # madroot on the remote CEDAR Madrigal server, not the local one
    madroot = '/opt/madrigal3'
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)
    expDir = os.path.dirname(filename)
    basename = os.path.basename(filename)
    # take into account that CEDAR madrigal server has both experiments and experiments3
    remoteFileList = [os.path.join(madroot, filename[filename.find('experiments'):]) + '.hdf5']
    remoteFileList.append(remoteFileList[-1].replace('experiments/', 'experiments3/'))
    remoteSummaryFileList = [os.path.join(madroot, expDir[expDir.find('experiments'):], 'overview', basename + '.hdf5.summary')]
    remoteSummaryFileList.append(remoteSummaryFileList[-1].replace('experiments/', 'experiments3/'))
    if expDirNum == '3':
        # reverse both lists to try experiments3 first
        remoteFileList.reverse()
        remoteSummaryFileList.reverse()
    destDir = os.path.join(os.path.dirname(filename), 'overview')
    # make sure the local overview directory exists before downloading into it;
    # exist_ok guards against a sibling pool worker creating it concurrently
    os.makedirs(destDir, exist_ok=True)
    destination = os.path.join(destDir, os.path.basename(filename) + '.hdf5')
    summDest = destination + '.summary'
    for i, (remoteFile, remoteSummaryFile) in enumerate(zip(remoteFileList, remoteSummaryFileList)):
        try:
            madWebObj.downloadFile(remoteFile, destination, user_fullname, user_email, user_affiliation,
                                   format)
            madWebObj.downloadFile(remoteSummaryFile, summDest, user_fullname, user_email, user_affiliation,
                                   format)
            return(True)
        except Exception:
            # narrowed from a bare except so Ctrl-C / SystemExit are not swallowed;
            # any download failure just falls through to the next candidate path
            if i < len(remoteFileList) - 1:
                continue
            else:
                print(('Failed to download any of %s' % (str(remoteFileList))))
                return(False)
def createHdfFile(args):
    """createHdfFile creates (or downloads) the cached Hdf5 file for a single Cedar 2.X file.

    args is a single tuple of (filename, iniFile, overwrite, mad3, removeSummary,
    skipMad3Download, expDirNum, quiet) so this function can be passed to
    multiprocessing.Pool.map.

    NOTE(review): relies on the module-level global mad3Url being set before the
    pool workers are forked (it is set in the __main__ section) — confirm this
    script is only run on platforms where Pool uses fork.

    This code must not raise, since it runs inside a pool worker; unexpected
    problems are printed with a traceback instead.
    """
    try:
        with warnings.catch_warnings():
            # we know we are calling deprecated code - suppress warnings
            warnings.simplefilter("ignore")
            filename, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet = args
            madDB = madrigal.metadata.MadrigalDB()
            # skip if already Hdf5
            fileName, fileExtension = os.path.splitext(filename)
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not quiet:
                    print(('skipping %s because already Hdf5' % (filename)))
                return
            hdf5Name = os.path.join(os.path.dirname(filename), 'overview', os.path.basename(filename) + '.hdf5')
            if os.access(hdf5Name, os.R_OK) and not overwrite:
                if not mad3:
                    return
                else:
                    # check if already Madrigal3 (Madrigal3 cached files carry a
                    # Metadata/_record_layout dataset)
                    f = None  # predefine so the except clause can close it safely
                    try:
                        f = h5py.File(hdf5Name, 'r')
                        if '_record_layout' in list(f['Metadata'].keys()):
                            f.close()
                            if not quiet:
                                print(('skipping %s because cached file already Madrigal3' % (filename)))
                            return
                        else:
                            print(('overwriting cached file for %s because not Madrigal3' % (filename)))
                            f.close()
                            os.remove(hdf5Name)
                    except Exception:
                        traceback.print_exc()
                        try:
                            if f is not None:
                                f.close()
                        except Exception:
                            pass
                        print(('Problem with cached file for %s - removing' % (filename)))
                        os.remove(hdf5Name)
            if removeSummary:
                summaryFile = os.path.join(os.path.dirname(filename), 'overview',
                                           os.path.basename(filename) + '.summary')
                try:
                    if not quiet:
                        print(('removing summary %s' % (summaryFile)))
                    os.remove(summaryFile)
                except OSError:
                    # best effort - the summary file may simply not exist
                    pass
            # first try to download file from mad3Url if not skipMad3Download
            result = False
            if not skipMad3Download:
                result = downloadMad3File(filename, mad3Url, expDirNum)
                if result:
                    print(('Downloaded cached and summary file for %s from Mad3 CEDAR Madrigal site' % (filename)))
            if not result:
                # this Hdf5 file needs to be created locally
                print(('creating hdf5 file for %s' % (filename)))
                try:
                    madFileObj = madrigal.data.MadrigalFile(filename, madDB)
                    madFileObj.getCachedHdf5(iniFile, overwrite, showWarnings=True)
                except Exception:
                    traceback.print_exc()
    except Exception:
        print('Unexpected error')
        traceback.print_exc()
### main script begins here ###
if __name__ == '__main__':

    instList = None
    includeNonDefault = 0
    includeGeo = False
    iniFile = None
    overwrite = False
    mad3 = False
    expPath = None
    removeSummary = False
    skipMad3Download = False
    # default worker count; clamped below to at least 1 and at most this value
    numCPU = multiprocessing.cpu_count() - 2
    quiet = False
    mad3Url = 'http://cedar.openmadrigal.org'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["inst=", "path=", "includeNonDefault", "overwrite", "ini=",
                                                       "mad3", "includeGeo", "numCPU=", "help", "removeSummary",
                                                       "skipMad3Download", "quiet"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "--inst":
            instItems = a.split(',')
            instList = []
            for inst in instItems:
                try:
                    instList.append(int(inst))
                except ValueError:
                    print(('--inst must be a comma delimited list of kinst (integers), not %s' % (a)))
                    raise
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off trailing /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--mad3':
            mad3 = True
        elif o == '--ini':
            iniFile = a
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        elif o == '--removeSummary':
            removeSummary = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        elif o == '--quiet':
            quiet = True
        else:
            assert False, "unhandled option"

    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()
    fileList = madDB.getFileList(kinstList=instList, includeNonDefault=includeNonDefault,
                                 path=expPath)

    # kinsts of geophysical files, skipped unless --includeGeo given
    geoList = [120, 210, 211, 212]

    expDict = {}  # dict with keys = kinst, value = list of tuples of (exp sDT, eDT, expDirNum)
    # populated only if needed as set by skipMad3Download

    filesToProcess = []  # the list to pass into the multiprocessing module to handle

    numCPU = min(max(1, multiprocessing.cpu_count() - 2), numCPU)
    pool = multiprocessing.Pool(processes=numCPU)
    print(('Creating Cached Hdf5 files using %i cpu\'s' % (numCPU)))

    hdf5Exts = ('.h5', '.hdf5', '.hdf')
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    # search the remote server through the end of the current year instead of a
    # hard-coded 2020-12-31, which silently excluded newer experiments
    endYear = datetime.datetime.now().year

    print('This next step may take a few hours....')
    for thisFile in fileList:
        # check expPath
        if expPath:
            if thisFile.find(expPath) == -1:
                continue
        # skip Hdf5 files here so things are faster for an almost completely converted Madrigal site
        base, ext = os.path.splitext(thisFile)
        if ext in hdf5Exts:
            continue
        expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
        madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
        kinst = madExpObj.getKinstByPosition(0)
        if kinst in geoList and not includeGeo:
            continue
        if not skipMad3Download:
            if kinst not in expDict:
                # cache the remote experiment list for this instrument
                expList = madWebObj.getExperiments(kinst, 1950, 1, 1, 0, 0, 0,
                                                   endYear, 12, 31, 23, 59, 59)
                expList.sort()
                data = []
                for exp in expList:
                    sDT = datetime.datetime(exp.startyear, exp.startmonth, exp.startday,
                                            exp.starthour, exp.startmin, exp.startsec)
                    eDT = datetime.datetime(exp.endyear, exp.endmonth, exp.endday,
                                            exp.endhour, exp.endmin, exp.endsec)
                    url = exp.url
                    if url.find('experiments3') != -1:
                        expDirNum = '3'
                    elif url.find('experiments2') != -1:
                        expDirNum = '2'
                    else:
                        expDirNum = ''
                    data.append((sDT, eDT, expDirNum))
                expDict[kinst] = data
            sList = madExpObj.getExpStartDateTimeByPosition()[:6]
            sDT = datetime.datetime(*sList)
            eList = madExpObj.getExpEndDateTimeByPosition()[:6]
            eDT = datetime.datetime(*eList)
            # midpoint of the local experiment; the original sDT + (eDT - sDT)
            # evaluated to exactly eDT, which could miss the remote experiment
            # when the end times differ slightly
            mDT = sDT + (eDT - sDT) / 2
            # loop through the remote experiments to get the right experiment directory
            expDirNum = None
            for sDT, eDT, thisNum in expDict[kinst]:
                if sDT <= mDT and mDT <= eDT:
                    expDirNum = thisNum
                    break
        else:
            expDirNum = None
        filesToProcess.append((thisFile, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet))

    # to better balance the load, apply random shuffle
    random.shuffle(filesToProcess)

    if len(filesToProcess):
        pool.map(createHdfFile, filesToProcess, 5)
    # wait for all workers to finish before declaring success
    pool.close()
    pool.join()

    print('All HDF5 cached files successfully created')