|
|
#!PYTHONEXE
|
|
|
|
|
|
"""createCachedHdf5Files.py is a script that will walk all the files in a Madrigal database, and make sure all
|
|
|
files in the old Cedar 2.X format have cached hdf5 versions. Used only when updating from Madrigal 2 to
|
|
|
Madrigal 3.
|
|
|
|
|
|
$Id: createCachedHdf5Files.py 7119 2020-06-22 20:28:21Z brideout $
|
|
|
"""
|
|
|
|
|
|
usage = """createCachedHdf5Files.py [--inst=<instList> --path=<expPath> --includeNonDefault --ini=<iniFile> --mad3 --overwrite
|
|
|
--includeGeo --numCPU=<numCPU> -h --help --removeSummary --skipMad3Download]
|
|
|
By default all instruments will be included. Use --inst=<comma delimited kinst list> to only include some instruments.
|
|
|
By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories.
|
|
|
By default only default files will be cached. Use --includeNonDefault to include all files.
|
|
|
By default, extra parameters and formats are added by the ini file $MADROOT/cachedFiles.ini. Use
|
|
|
--ini=<iniFile> to specify an alternative ini file. See madrigal.data.MadrigalFile._parseCachedIni for description of
|
|
|
the ini file format. Set --includeGeo to also convert geophysical files
|
|
|
Use --overwrite to overwrite all Hdf5 cached files
|
|
|
Use --mad3 to overwrite all non Madrigal3 Hdf5 files
|
|
|
numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be higher than default.
|
|
|
Use --removeSummary to remove summary files before creating Hdf5 files
|
|
|
Use --skipMad3Download to not try to download file from madrigal3.haystack.mit.edu
|
|
|
-h or --help - print usage and exit
|
|
|
"""
|
|
|
|
|
|
import os, os.path, sys
|
|
|
import getopt
|
|
|
import time, datetime
|
|
|
import traceback
|
|
|
import multiprocessing
|
|
|
import warnings
|
|
|
import random
|
|
|
|
|
|
import h5py
|
|
|
|
|
|
import madrigal.metadata
|
|
|
import madrigal.data
|
|
|
import madrigalWeb.madrigalWeb
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def downloadMad3File(filename, mad3Url, expDirNum):
    """downloadMad3File downloads the Madrigal 3 Hdf5 file associated with filename, if possible,
    from mad3Url. Also downloads the matching summary file.

    The remote CEDAR Madrigal server stores files under both experiments/ and
    experiments3/ directory trees, so both candidate remote paths are tried;
    expDirNum is a hint for which tree to try first.

    Inputs:

        filename - full path to filename on present Madrigal2 server

        mad3Url - url of Madrigal 3 CEDAR Madrigal server to get hdf5 version from

        expDirNum - either '' or '3' - suggestion as to what experiment directory to try first.

    Returns True if success, False if not.
    """
    user_fullname = 'Bill Rideout'
    user_email = 'brideout@haystack.mit.edu'
    user_affiliation = 'MIT'
    format = 'hdf5'
    # madroot on the *remote* CEDAR Madrigal server (not this server's MADROOT)
    madroot = '/opt/madrigal3'
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)
    expDir = os.path.dirname(filename)
    basename = os.path.basename(filename)

    # take into account that CEDAR madrigal server has both experiments and experiments3
    remoteFileList = [os.path.join(madroot, filename[filename.find('experiments'):]) + '.hdf5']
    remoteFileList.append(remoteFileList[-1].replace('experiments/', 'experiments3/'))

    remoteSummaryFileList = [os.path.join(madroot, expDir[expDir.find('experiments'):], 'overview', basename + '.hdf5.summary')]
    remoteSummaryFileList.append(remoteSummaryFileList[-1].replace('experiments/', 'experiments3/'))

    if expDirNum == '3':
        # reverse both lists to try experiments3 first
        remoteFileList.reverse()
        remoteSummaryFileList.reverse()

    # both downloads land in the local overview/ subdirectory of the experiment
    destDir = os.path.join(os.path.dirname(filename), 'overview')
    destination = os.path.join(destDir, os.path.basename(filename) + '.hdf5')
    summDest = destination + '.summary'

    # the data file and its summary file are paired by index, so iterate them together
    for remoteFile, remoteSummaryFile in zip(remoteFileList, remoteSummaryFileList):
        try:
            madWebObj.downloadFile(remoteFile, destination, user_fullname, user_email, user_affiliation,
                                   format)
            madWebObj.downloadFile(remoteSummaryFile, summDest, user_fullname, user_email, user_affiliation,
                                   format)
            return(True)
        # narrow from a bare except so KeyboardInterrupt/SystemExit still propagate
        except Exception:
            # this candidate path failed - fall through to the next one (if any)
            continue

    print(('Failed to download any of %s' % (str(remoteFileList))))
    return(False)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def createHdfFile(args):
    """Pool worker: ensure a single Madrigal file has an up-to-date cached Hdf5 version.

    args is a single tuple (so it can be passed through multiprocessing.Pool.map):
        filename - full path of the Madrigal file to cache
        iniFile - alternative cachedFiles.ini path, or None for the default
        overwrite - if True, recreate the cached Hdf5 even if it exists
        mad3 - if True, recreate cached files that are not yet Madrigal3 format
        removeSummary - if True, delete the .summary file before caching
        skipMad3Download - if True, never try downloading from the Mad3 server
        expDirNum - '' or '3' (or None) - hint for downloadMad3File
        quiet - if True, suppress per-file "skipping" messages

    Returns None. NOTE(review): reads the module-level global mad3Url set in the
    __main__ block - works with fork-based multiprocessing; confirm before using
    a spawn start method.
    """
    # this code cannot raise an error - any exception would poison the Pool,
    # so everything is wrapped and reported via traceback instead
    try:
        with warnings.catch_warnings():
            # we know we are calling deprecated code - surpress warnings
            warnings.simplefilter("ignore")
            filename, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet = args
            madDB = madrigal.metadata.MadrigalDB()
            # skip if already Hdf5
            fileName, fileExtension = os.path.splitext(filename)
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not quiet:
                    print(('skipping %s because already Hdf5' % (filename)))
                return
            # cached file lives in the experiment's overview/ subdirectory
            hdf5Name = os.path.join(os.path.dirname(filename), 'overview', os.path.basename(filename) + '.hdf5')
            if os.access(hdf5Name, os.R_OK) and not overwrite:
                if not mad3:
                    # cached file exists and no format check requested - nothing to do
                    return
                else:
                    # check if already Madrigal3
                    try:
                        f = h5py.File(hdf5Name, 'r')
                        # presence of Metadata/_record_layout marks a Madrigal3-format file
                        if '_record_layout' in list(f['Metadata'].keys()):
                            f.close()
                            if not quiet:
                                print(('skipping %s because cached file already Madrigal3' % (filename)))
                            return
                        else:
                            print(('overwriting cached file for %s because not Madrigal3' % (filename)))
                            f.close()
                            # fall through below to regenerate the cached file
                            os.remove(hdf5Name)
                    except:
                        # unreadable/corrupt cached file - report it, make sure the
                        # handle is closed, then remove it so it gets regenerated
                        traceback.print_exc()
                        try:
                            f.close()
                        except:
                            pass
                        print(('Problem with cached file for %s - removing' % (filename)))
                        os.remove(hdf5Name)

            if removeSummary:
                summaryFile = os.path.join(os.path.dirname(filename), 'overview',
                                           os.path.basename(filename) + '.summary')
                try:
                    if not quiet:
                        print(('removing summary %s' % (summaryFile)))
                    os.remove(summaryFile)
                except:
                    # best effort - the summary file may simply not exist
                    pass

            # first try to download file from mad3Url if not skipMad3Download
            result = False
            if not skipMad3Download:
                result = downloadMad3File(filename, mad3Url, expDirNum)
                if result:
                    print(('Downloaded cached and summary file for %s from Mad3 CEDAR Madrigal site' % (filename)))

            if not result:
                # this Hdf5 file needs to be created locally (slow path)
                print(('creating hdf5 file for %s' % (filename)))
                try:
                    madFileObj = madrigal.data.MadrigalFile(filename, madDB)
                    madFileObj.getCachedHdf5(iniFile, overwrite, showWarnings=True)
                except:
                    # conversion failed for this file - report and move on
                    traceback.print_exc()

    except:
        # catch-all so the worker never raises into the Pool
        print('Unexpected error')
        traceback.print_exc()
|
|
|
|
|
|
|
|
|
### main script begins here ###
if __name__ == '__main__':

    # --- default option values ---
    instList = None            # None -> include all instruments
    includeNonDefault = 0      # 0 -> cache default files only
    includeGeo = False         # also convert geophysical files?
    iniFile = None             # None -> $MADROOT/cachedFiles.ini
    overwrite = False          # overwrite all cached Hdf5 files
    mad3 = False               # overwrite only non-Madrigal3 cached Hdf5 files
    expPath = None             # None -> all experiment directories
    removeSummary = False      # remove summary files before creating Hdf5
    skipMad3Download = False   # do not try downloading from the Mad3 CEDAR server
    # leave two cpus free by default, but never fewer than one worker
    # (cpu_count()-2 alone could be zero or negative on small machines,
    #  which would make multiprocessing.Pool raise)
    numCPU = max(1, multiprocessing.cpu_count() - 2)
    quiet = False

    # Madrigal 3 server to try to download already-converted files from
    mad3Url = 'http://cedar.openmadrigal.org'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["inst=", "path=", "includeNonDefault", "overwrite", "ini=",
                                                       "mad3", "includeGeo", "numCPU=", "help", "removeSummary",
                                                       "skipMad3Download", "quiet"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "--inst":
            instItems = a.split(',')
            instList = []
            for inst in instItems:
                try:
                    instList.append(int(inst))
                except:
                    print(('--inst must be a comma delimited list of kinst (integers), not %s' % (a)))
                    raise
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off trailing /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--mad3':
            mad3 = True
        elif o == '--ini':
            iniFile = a
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        elif o == '--removeSummary':
            removeSummary = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        elif o == '--quiet':
            quiet = True
        else:
            assert False, "unhandled option"

    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()
    fileList = madDB.getFileList(kinstList=instList, includeNonDefault=includeNonDefault,
                                 path=expPath)

    # kinst values of geophysical instruments - skipped unless --includeGeo
    geoList = [120, 210, 211, 212]

    # expDict - dict with keys = kinst, value = list of tuples of (exp sDT, eDT, expDirNum)
    # populated lazily, and only when skipMad3Download is not set
    expDict = {}

    filesToProcess = []  # the list of argument tuples passed to the multiprocessing pool

    # a user-supplied --numCPU may lower, but never raise, the default cap
    numCPU = min(max(1, multiprocessing.cpu_count() - 2), numCPU)
    pool = multiprocessing.Pool(processes=numCPU)
    print(('Creating Cached Hdf5 files using %i cpu\'s' % (numCPU)))

    hdf5Exts = ('.h5', '.hdf5', '.hdf')
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    print('This next step may take a few hours....')
    for thisFile in fileList:
        # check expPath (defensive - getFileList was already passed path=expPath)
        if expPath:
            if thisFile.find(expPath) == -1:
                continue

        # skip Hdf5 files here so things are faster for an almost completely converted Madrigal site
        base, ext = os.path.splitext(thisFile)
        if ext in hdf5Exts:
            continue

        expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
        madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
        kinst = madExpObj.getKinstByPosition(0)
        if kinst in geoList and not includeGeo:
            continue

        # expDirNum must always be defined because it is appended to the args
        # tuple below even when skipMad3Download is set (previously this was
        # a NameError when --skipMad3Download was given)
        expDirNum = None

        if not skipMad3Download:
            if kinst not in expDict:
                # query the remote server once per kinst and cache the result.
                # end the search window at the current year instead of a
                # hard-coded year so the script does not go stale
                endYear = datetime.datetime.now().year
                expList = madWebObj.getExperiments(kinst, 1950, 1, 1, 0, 0, 0, endYear, 12, 31, 23, 59, 59)
                expList.sort()
                data = []
                for exp in expList:
                    sDT = datetime.datetime(exp.startyear, exp.startmonth, exp.startday,
                                            exp.starthour, exp.startmin, exp.startsec)
                    eDT = datetime.datetime(exp.endyear, exp.endmonth, exp.endday,
                                            exp.endhour, exp.endmin, exp.endsec)
                    url = exp.url
                    # the directory tree name in the url tells us which
                    # experiments* directory the remote file lives under
                    if url.find('experiments3') != -1:
                        dirNum = '3'
                    elif url.find('experiments2') != -1:
                        dirNum = '2'
                    else:
                        dirNum = ''
                    data.append((sDT, eDT, dirNum))
                expDict[kinst] = data

            sList = madExpObj.getExpStartDateTimeByPosition()[:6]
            sDT = datetime.datetime(*sList)
            eList = madExpObj.getExpEndDateTimeByPosition()[:6]
            eDT = datetime.datetime(*eList)
            # midpoint of the local experiment's time span
            # (was sDT + (eDT - sDT), which is just eDT - clearly unintended)
            mDT = sDT + (eDT - sDT) / 2

            # find the remote experiment whose time span contains the midpoint
            for expStart, expEnd, thisNum in expDict[kinst]:
                if expStart <= mDT <= expEnd:
                    expDirNum = thisNum
                    break

        filesToProcess.append((thisFile, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet))

    # to better balance the load, apply random shuffle
    random.shuffle(filesToProcess)

    if len(filesToProcess):
        pool.map(createHdfFile, filesToProcess, 5)

    # release the worker processes
    pool.close()
    pool.join()

    print('All HDF5 cached files successfully created')
|
|
|
|