#!PYTHONEXE

"""createCachedFiles.py is a script that will create cached text or netCDF4 files
to speed up user downloads.

$Id: createCachedFiles.py 7119 2020-06-22 20:28:21Z brideout $
"""

usage = """createCachedFiles.py [--excludeText --excludeNetCDF4 --inst= --kindat= --path= --includeNonDefault --overwrite --includeGeo --listOnly --numCPU= -h --help]

By default, both text and netCDF4 files created. Use --excludeText or --excludeNetCDF4 to only create one type.

By default all instruments will be included. Use --inst= to only include some instruments.

By default all kinds of data will be included. Use --kindat= to only include some kindats

By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories.

By default only default files will be cached. Use --includeNonDefault to include all files.

Use --overwrite to overwrite all Hdf5 cached files. Default is to skip existing cached files.

Set --includeGeo to also convert geophysical files. Default is to skip them.

Set --listOnly to simply print cached files to be created

numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be highter than default.

-h or --help - print usage and exit
"""

# standard python imports
import os
import os.path
import sys
import getopt
import time
import traceback
import multiprocessing
import subprocess

# madrigal imports
import madrigal.metadata
import madrigal.cedar


def createCachedFiles(args):
    """createCachedFiles is called for each file to be checked.

    It is written to be used as a multiprocessing.Pool.map target, so all
    inputs arrive packed in a single tuple.

    Inputs:
        args - a tuple of (filename, excludeText, excludeNetCDF4, overwrite, listOnly)
            filename - full path to the Madrigal file to create cached files for
            excludeText - if True, do not create the gzipped text cache
            excludeNetCDF4 - if True, do not create the netCDF4 cache
            overwrite - if True, recreate cached files even if they already exist
            listOnly - if True, only print which cached files are needed

    Cached files are written to the "overview" directory next to filename as
    <basename>.txt.gz and <basename>.nc.

    Returns True if the file was handled without error, False otherwise.

    This code cannot raise an ordinary error - any Exception is printed with its
    traceback and reported through the False return value, so that one bad file
    does not abort the whole pool.map call.
    """
    try:
        filename, excludeText, excludeNetCDF4, overwrite, listOnly = args
        if not listOnly:
            print('working on %s' % (filename))
            sys.stdout.flush()

        basename = os.path.basename(filename)
        overviewDir = os.path.join(os.path.dirname(filename), 'overview')

        if not excludeText:
            cachedFile = os.path.join(overviewDir, basename + '.txt')
            # the text cache is stored gzipped, so test for the .gz file
            if overwrite or not os.access(cachedFile + '.gz', os.R_OK):
                if listOnly:
                    print('ascii cache needed for %s' % (filename))
                else:
                    madrigal.cedar.convertToText(filename, cachedFile)
                    subprocess.check_call(['gzip', '-f', cachedFile])

        if not excludeNetCDF4:
            cachedFile = os.path.join(overviewDir, basename + '.nc')
            if overwrite or not os.access(cachedFile, os.R_OK):
                if listOnly:
                    print('netCDF4 cache needed for %s' % (filename))
                else:
                    if os.access(cachedFile, os.R_OK):
                        os.remove(cachedFile)
                    try:
                        madrigal.cedar.convertToNetCDF4(filename, cachedFile)
                    except IOError:
                        # fall back to writing via a MadrigalCedarFile object
                        cedarObj = madrigal.cedar.MadrigalCedarFile(filename)
                        cedarObj.write('netCDF4', cachedFile)

    except Exception:
        print('Unexpected error')
        traceback.print_exc()
        return False

    return True


def _parseIntList(optArg, errTemplate):
    """_parseIntList converts a comma delimited string of integers into a list of ints.

    Inputs:
        optArg - the command line argument value, e.g. '30,31,32'
        errTemplate - message with a single %s, printed with optArg if parsing fails

    Returns a list of ints.  Raises ValueError (after printing the message) if
    any item is not an integer.
    """
    try:
        return [int(item) for item in optArg.split(',')]
    except ValueError:
        print(errTemplate % (optArg))
        raise


### main script begins here ###
if __name__ == '__main__':

    excludeText = False
    excludeNetCDF4 = False
    instList = None
    kindatList = None
    includeNonDefault = 0
    includeGeo = False
    overwrite = False
    expPath = None
    listOnly = False
    # never use more than this many processes, and never fewer than one -
    # cpu_count()-2 alone is <= 0 on machines with one or two cpus
    maxCPU = max(1, multiprocessing.cpu_count() - 2)
    numCPU = maxCPU

    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["excludeText", "excludeNetCDF4",
                                                       "inst=", "kindat=", "path=",
                                                       "includeNonDefault", "overwrite",
                                                       "includeGeo", "numCPU=",
                                                       "listOnly", "help"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == '--excludeText':
            excludeText = True
        elif o == '--excludeNetCDF4':
            excludeNetCDF4 = True
        elif o == "--inst":
            instList = _parseIntList(a, '--inst must be a comma delimited list of kinst (integers), not %s')
        elif o == "--kindat":
            kindatList = _parseIntList(a, '--kindat must be a comma delimited list of kindat codes (integers), not %s')
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--listOnly':
            listOnly = True
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        else:
            # explicit raise rather than assert, so the check survives python -O
            raise AssertionError("unhandled option")

    if excludeText and excludeNetCDF4:
        print('Nothing to be done since both text and netCDF4 cached files excluded')
        sys.exit(0)

    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()
    fileList = madDB.getFileList(kinstList=instList, kindatList=kindatList,
                                 includeNonDefault=includeNonDefault, path=expPath)

    # possibly skip geophysical files
    geoList = [120, 210, 211, 212]

    # user may lower the number of cpus, but not raise it above the default
    numCPU = min(maxCPU, numCPU)

    print(('Creating cached files using %i cpu\'s' % (numCPU)))
    print('This next step may take a few hours....')

    filesToProcess = []  # the list to pass into the multiprocessing module to handle
    kinstByExpTab = {}   # expTab.txt path -> kinst, so each expTab.txt is parsed only once
    failureCount = 0     # number of files for which createCachedFiles returned False

    # the with statement guarantees the worker processes are cleaned up, even on error
    with multiprocessing.Pool(processes=numCPU) as pool:
        for thisFile in fileList:
            # check expPath
            if expPath and expPath not in thisFile:
                continue

            if not includeGeo:
                expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
                if expTab not in kinstByExpTab:
                    madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
                    kinstByExpTab[expTab] = madExpObj.getKinstByPosition(0)
                if kinstByExpTab[expTab] in geoList:
                    continue

            filesToProcess.append((thisFile, excludeText, excludeNetCDF4, overwrite, listOnly))

            # hand the work to the pool in batches to bound the size of each map call
            if len(filesToProcess) > 200:
                failureCount += pool.map(createCachedFiles, filesToProcess).count(False)
                filesToProcess = []

        # get all remaining
        if filesToProcess:
            failureCount += pool.map(createCachedFiles, filesToProcess).count(False)

    if failureCount:
        print('%i file(s) could not be cached - see errors above' % (failureCount))
    elif not listOnly:
        print('All cached files successfully created')