#!PYTHONEXE

"""createCachedHdf5Files.py is a script that will walk all the files in a Madrigal database,
and make sure all files in the old Cedar 2.X format have cached hdf5 versions.

Used only when updating from Madrigal 2 to Madrigal 3.

$Id: createCachedHdf5Files.py 7119 2020-06-22 20:28:21Z brideout $
"""

usage = """createCachedHdf5Files.py [--inst= --path= --includeNonDefault --ini=  --mad3
    --overwrite --includeGeo --numCPU= -h --help --removeSummary --skipMad3Download]

    By default all instruments will be included.  Use --inst= to only include some instruments.

    By default, all experiment directories will be included.  Use --path to limit to a
    particular directory and all subdirectories.

    By default only default files will be cached.  Use --includeNonDefault to include all files.

    By default, extra parameters and formats are added by the ini file $MADROOT/cachedFiles.ini.
    Use --ini= to specify an alternative ini file.  See madrigal.data.MadrigalFile._parseCachedIni
    for description of the ini file format.

    Set --includeGeo to also convert geophysical files

    Use --overwrite to overwrite all Hdf5 cached files

    Use --mad3 to overwrite all non Madrigal3 Hdf5 files

    numCPU by default is the maximum of (1, numCPUs available - 2).  Use --numCPU to override,
    but still will not be higher than default.

    Use --removeSummary to remove summary files before creating Hdf5 files

    Use --skipMad3Download to not try to download file from madrigal3.haystack.mit.edu

    -h or --help - print usage and exit
"""

import os
import os.path
import sys
import getopt
import time
import datetime
import traceback
import multiprocessing
import warnings
import random

import h5py

import madrigal.metadata
import madrigal.data
import madrigalWeb.madrigalWeb

# Madrigal 3 CEDAR server that cached files are downloaded from.  Defined at module
# level (not under __main__) so that multiprocessing worker processes can see it even
# when the 'spawn' start method is used (e.g., macOS/Windows), where __main__-only
# globals are not inherited.
mad3Url = 'http://cedar.openmadrigal.org'


def downloadMad3File(filename, mad3Url, expDirNum):
    """downloadMad3File downloads the Madrigal 3 Hdf5 file associated with filename,
    if possible, from mad3Url.  Also downloads the associated summary file.

    Inputs:

        filename - full path to filename on present Madrigal 2 server

        mad3Url - url of Madrigal 3 CEDAR Madrigal server to get hdf5 version from

        expDirNum - either '' or '3' - suggestion as to which remote experiment
            directory (experiments or experiments3) to try first.

    Returns True if success, False if not.
    """
    user_fullname = 'Bill Rideout'
    user_email = 'brideout@haystack.mit.edu'
    user_affiliation = 'MIT'
    format = 'hdf5'
    # madroot on the remote CEDAR Madrigal server, not on this server
    madroot = '/opt/madrigal3'

    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    expDir = os.path.dirname(filename)
    basename = os.path.basename(filename)

    # take into account that CEDAR madrigal server has both experiments and experiments3
    remoteFileList = [os.path.join(madroot, filename[filename.find('experiments'):]) + '.hdf5']
    remoteFileList.append(remoteFileList[-1].replace('experiments/', 'experiments3/'))
    remoteSummaryFileList = [os.path.join(madroot, expDir[expDir.find('experiments'):],
                                          'overview', basename + '.hdf5.summary')]
    remoteSummaryFileList.append(remoteSummaryFileList[-1].replace('experiments/', 'experiments3/'))

    if expDirNum == '3':
        # reverse both lists to try experiments3 first
        remoteFileList.reverse()
        remoteSummaryFileList.reverse()

    destDir = os.path.join(os.path.dirname(filename), 'overview')
    destination = os.path.join(destDir, os.path.basename(filename) + '.hdf5')
    summDest = destination + '.summary'

    for i in range(len(remoteFileList)):
        remoteFile = remoteFileList[i]
        remoteSummaryFile = remoteSummaryFileList[i]
        try:
            madWebObj.downloadFile(remoteFile, destination, user_fullname, user_email,
                                   user_affiliation, format)
            madWebObj.downloadFile(remoteSummaryFile, summDest, user_fullname, user_email,
                                   user_affiliation, format)
            return(True)
        except Exception:
            # this remote path failed - try the next one, or give up after the last
            if i < len(remoteFileList) - 1:
                continue
            else:
                print('Failed to download any of %s' % (str(remoteFileList)))
                return(False)


def createHdfFile(args):
    """createHdfFile creates (or downloads) the cached Hdf5 version of one Madrigal file.

    args is a tuple of (filename, iniFile, overwrite, mad3, removeSummary,
    skipMad3Download, expDirNum, quiet).

    Designed to be called via multiprocessing.Pool.map, so this code must not raise
    an error - any unexpected exception is printed and swallowed.
    """
    try:
        with warnings.catch_warnings():
            # we know we are calling deprecated code - suppress warnings
            warnings.simplefilter("ignore")

            filename, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet = args

            madDB = madrigal.metadata.MadrigalDB()

            # skip if already Hdf5
            fileName, fileExtension = os.path.splitext(filename)
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not quiet:
                    print('skipping %s because already Hdf5' % (filename))
                return

            hdf5Name = os.path.join(os.path.dirname(filename), 'overview',
                                    os.path.basename(filename) + '.hdf5')
            if os.access(hdf5Name, os.R_OK) and not overwrite:
                if not mad3:
                    return
                else:
                    # check if the existing cached file is already Madrigal3 format
                    f = None  # so the except clause can safely test whether open succeeded
                    try:
                        f = h5py.File(hdf5Name, 'r')
                        if '_record_layout' in list(f['Metadata'].keys()):
                            f.close()
                            if not quiet:
                                print('skipping %s because cached file already Madrigal3' % (filename))
                            return
                        else:
                            print('overwriting cached file for %s because not Madrigal3' % (filename))
                            f.close()
                            os.remove(hdf5Name)
                    except Exception:
                        traceback.print_exc()
                        try:
                            if f is not None:
                                f.close()
                        except Exception:
                            pass
                        print('Problem with cached file for %s - removing' % (filename))
                        os.remove(hdf5Name)

            if removeSummary:
                summaryFile = os.path.join(os.path.dirname(filename), 'overview',
                                           os.path.basename(filename) + '.summary')
                try:
                    if not quiet:
                        print('removing summary %s' % (summaryFile))
                    os.remove(summaryFile)
                except OSError:
                    # best effort - the summary file may simply not exist
                    pass

            # first try to download file from mad3Url if not skipMad3Download
            result = False
            if not skipMad3Download:
                result = downloadMad3File(filename, mad3Url, expDirNum)
                if result:
                    print('Downloaded cached and summary file for %s from Mad3 CEDAR Madrigal site' % (filename))

            if not result:
                # this Hdf5 file needs to be created locally
                print('creating hdf5 file for %s' % (filename))
                try:
                    madFileObj = madrigal.data.MadrigalFile(filename, madDB)
                    madFileObj.getCachedHdf5(iniFile, overwrite, showWarnings=True)
                except Exception:
                    traceback.print_exc()

    except Exception:
        print('Unexpected error')
        traceback.print_exc()


### main script begins here ###
if __name__ == '__main__':

    instList = None
    includeNonDefault = 0
    includeGeo = False
    iniFile = None
    overwrite = False
    mad3 = False
    expPath = None
    removeSummary = False
    skipMad3Download = False
    # default: leave two cpus free, but never fewer than one worker.
    # (A bare cpu_count() - 2 could be 0 or negative on small machines, which
    # would make multiprocessing.Pool raise.)
    numCPU = max(1, multiprocessing.cpu_count() - 2)
    quiet = False

    try:
        opts, args = getopt.getopt(sys.argv[1:], "h",
                                   ["inst=", "path=", "includeNonDefault", "overwrite", "ini=",
                                    "mad3", "includeGeo", "numCPU=", "help", "removeSummary",
                                    "skipMad3Download", "quiet"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "--inst":
            instItems = a.split(',')
            instList = []
            for inst in instItems:
                try:
                    instList.append(int(inst))
                except:
                    print('--inst must be a comma delimited list of kinst (integers), not %s' % (a))
                    raise
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off trailing /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--mad3':
            mad3 = True
        elif o == '--ini':
            iniFile = a
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        elif o == '--removeSummary':
            removeSummary = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        elif o == '--quiet':
            quiet = True
        else:
            assert False, "unhandled option"

    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()

    fileList = madDB.getFileList(kinstList=instList,
                                 includeNonDefault=includeNonDefault,
                                 path=expPath)

    # kinst values of geophysical instruments, skipped unless --includeGeo was given
    geoList = [120, 210, 211, 212]

    expDict = {}  # dict with keys = kinst, value = list of tuples of (exp sDT, eDT, expDirNum)
                  # created only if needed as set by skipMad3Download

    filesToProcess = []  # the list to pass into the multiprocessing module to handle

    # never exceed the default cpu budget, even if --numCPU asked for more
    numCPU = min(max(1, multiprocessing.cpu_count() - 2), numCPU)
    pool = multiprocessing.Pool(processes=numCPU)
    print('Creating Cached Hdf5 files using %i cpu\'s' % (numCPU))

    hdf5Exts = ('.h5', '.hdf5', '.hdf')

    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    # query remote experiments through the end of the current year
    # (originally hard-coded to 2020, which would miss newer experiments)
    endYear = datetime.datetime.now().year

    print('This next step may take a few hours....')
    for thisFile in fileList:
        # check expPath
        if expPath:
            if thisFile.find(expPath) == -1:
                continue
        # skip Hdf5 files here so things are faster for an almost completely
        # converted Madrigal site
        base, ext = os.path.splitext(thisFile)
        if ext in hdf5Exts:
            continue
        expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
        madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
        kinst = madExpObj.getKinstByPosition(0)
        if kinst in geoList and not includeGeo:
            continue

        # expDirNum must be defined even when skipMad3Download is set,
        # since it is always part of the worker args tuple
        expDirNum = None
        if not skipMad3Download:
            if kinst not in expDict:
                # first time seeing this kinst - cache the remote experiment list
                expList = madWebObj.getExperiments(kinst, 1950, 1, 1, 0, 0, 0,
                                                   endYear, 12, 31, 23, 59, 59)
                expList.sort()
                data = []
                for exp in expList:
                    expSDT = datetime.datetime(exp.startyear, exp.startmonth, exp.startday,
                                               exp.starthour, exp.startmin, exp.startsec)
                    expEDT = datetime.datetime(exp.endyear, exp.endmonth, exp.endday,
                                               exp.endhour, exp.endmin, exp.endsec)
                    url = exp.url
                    if url.find('experiments3') != -1:
                        thisDirNum = '3'
                    elif url.find('experiments2') != -1:
                        thisDirNum = '2'
                    else:
                        thisDirNum = ''
                    data.append((expSDT, expEDT, thisDirNum))
                expDict[kinst] = data

            sList = madExpObj.getExpStartDateTimeByPosition()[:6]
            sDT = datetime.datetime(*sList)
            eList = madExpObj.getExpEndDateTimeByPosition()[:6]
            eDT = datetime.datetime(*eList)
            # use the midpoint of the local experiment to find the matching remote
            # experiment directory (the original sDT + (eDT - sDT) simplified to eDT,
            # which could fall exactly on a boundary between two experiments)
            mDT = sDT + (eDT - sDT) / 2

            # loop through the remote experiments to get the right experiment directory
            for expSDT, expEDT, thisNum in expDict[kinst]:
                if expSDT <= mDT and mDT <= expEDT:
                    expDirNum = thisNum
                    break

        filesToProcess.append((thisFile, iniFile, overwrite, mad3, removeSummary,
                               skipMad3Download, expDirNum, quiet))

    # to better balance the load across workers, apply random shuffle
    random.shuffle(filesToProcess)

    if len(filesToProcess):
        pool.map(createHdfFile, filesToProcess, 5)

    print('All HDF5 cached files successfully created')