##// END OF EJS Templates
Add BASE_URL in settings to work with proxies
Add BASE_URL in settings to work with proxies

File last commit:

r0:b84e1135c2c4
r18:5a8055e18e7b
Show More
createCachedHdf5Files.py
320 lines | 13.0 KiB | text/x-python | PythonLexer
/ source / madpy / scripts / bin / createCachedHdf5Files.py
#!PYTHONEXE
"""createCachedHdf5Files.py is a script that will walk all the files in a Madrigal database, and make sure all
files in the old Cedar 2.X format have cached hdf5 versions. Used only when updating from Madrigal 2.X to
Madrigal 3.
$Id: createCachedHdf5Files.py 7119 2020-06-22 20:28:21Z brideout $
"""
# usage text printed for -h/--help (now documents --quiet, which getopt accepts)
usage = """createCachedHdf5Files.py [--inst=<instList> --path=<expPath> --includeNonDefault --ini=<iniFile> --mad3 --overwrite
--includeGeo --numCPU=<numCPU> -h --help --removeSummary --skipMad3Download --quiet]
By default all instruments will be included. Use --inst=<comma delimited kinst list> to only include some instruments.
By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories.
By default only default files will be cached. Use --includeNonDefault to include all files.
By default, extra parameters and formats are added by the ini file $MADROOT/cachedFiles.ini. Use
--ini=<iniFile> to specify an alternative ini file. See madrigal.data.MadrigalFile._parseCachedIni for description of
the ini file format. Set --includeGeo to also convert geophysical files
Use --overwrite to overwrite all Hdf5 cached files
Use --mad3 to overwrite all non Madrigal3 Hdf5 files
numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be higher than default.
Use --removeSummary to remove summary files before creating Hdf5 files
Use --skipMad3Download to not try to download file from madrigal3.haystack.mit.edu
Use --quiet to suppress per-file skip messages
-h or --help - print usage and exit
"""
import os, os.path, sys
import getopt
import time, datetime
import traceback
import multiprocessing
import warnings
import random
import h5py
import madrigal.metadata
import madrigal.data
import madrigalWeb.madrigalWeb
def downloadMad3File(filename, mad3Url, expDirNum):
    """downloadMad3File downloads the appropriate Madrigal 3 Hdf5 associated with filename if possible
    from mad3Url. Also downloads the summary file.

    Inputs:
        filename - full path to filename on present Madrigal2 server
        mad3Url - url of Madrigal 3 CEDAR Madrigal server to get hdf5 version from
        expDirNum - either '' or '3' - suggestion as to what experiment directory to try first.

    Returns True if success, False if not.
    """
    user_fullname = 'Bill Rideout'
    user_email = 'brideout@haystack.mit.edu'
    user_affiliation = 'MIT'
    format = 'hdf5'
    # madroot on the remote CEDAR Madrigal server, not the local one
    madroot = '/opt/madrigal3'
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)
    expDir = os.path.dirname(filename)
    basename = os.path.basename(filename)
    # take into account that CEDAR madrigal server has both experiments and experiments3
    remoteFileList = [os.path.join(madroot, filename[filename.find('experiments'):]) + '.hdf5']
    remoteFileList.append(remoteFileList[-1].replace('experiments/', 'experiments3/'))
    remoteSummaryFileList = [os.path.join(madroot, expDir[expDir.find('experiments'):], 'overview', basename + '.hdf5.summary')]
    remoteSummaryFileList.append(remoteSummaryFileList[-1].replace('experiments/', 'experiments3/'))
    if expDirNum == '3':
        # reverse both lists to try experiments3 first
        remoteFileList.reverse()
        remoteSummaryFileList.reverse()
    destDir = os.path.join(os.path.dirname(filename), 'overview')
    # make sure the local overview directory exists before downloading into it;
    # exist_ok guards against a sibling pool worker creating it concurrently
    os.makedirs(destDir, exist_ok=True)
    destination = os.path.join(destDir, os.path.basename(filename) + '.hdf5')
    summDest = destination + '.summary'
    for i, (remoteFile, remoteSummaryFile) in enumerate(zip(remoteFileList, remoteSummaryFileList)):
        try:
            madWebObj.downloadFile(remoteFile, destination, user_fullname, user_email, user_affiliation,
                                   format)
            madWebObj.downloadFile(remoteSummaryFile, summDest, user_fullname, user_email, user_affiliation,
                                   format)
            return(True)
        except Exception:
            # narrowed from a bare except so Ctrl-C / SystemExit are not swallowed;
            # any download failure just falls through to the next candidate path
            if i < len(remoteFileList) - 1:
                continue
            else:
                print(('Failed to download any of %s' % (str(remoteFileList))))
                return(False)
def createHdfFile(args):
    """createHdfFile creates (or downloads) the cached Hdf5 file for a single Cedar 2.X file.

    args is a single tuple of (filename, iniFile, overwrite, mad3, removeSummary,
    skipMad3Download, expDirNum, quiet) so this function can be passed to
    multiprocessing.Pool.map.

    NOTE(review): relies on the module-level global mad3Url being set before the
    pool workers are forked (it is set in the __main__ section) — confirm this
    script is only run on platforms where Pool uses fork.

    This code must not raise, since it runs inside a pool worker; unexpected
    problems are printed with a traceback instead.
    """
    try:
        with warnings.catch_warnings():
            # we know we are calling deprecated code - suppress warnings
            warnings.simplefilter("ignore")
            filename, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet = args
            madDB = madrigal.metadata.MadrigalDB()
            # skip if already Hdf5
            fileName, fileExtension = os.path.splitext(filename)
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not quiet:
                    print(('skipping %s because already Hdf5' % (filename)))
                return
            hdf5Name = os.path.join(os.path.dirname(filename), 'overview', os.path.basename(filename) + '.hdf5')
            if os.access(hdf5Name, os.R_OK) and not overwrite:
                if not mad3:
                    return
                else:
                    # check if already Madrigal3 (Madrigal3 cached files carry a
                    # Metadata/_record_layout dataset)
                    f = None  # predefine so the except clause can close it safely
                    try:
                        f = h5py.File(hdf5Name, 'r')
                        if '_record_layout' in list(f['Metadata'].keys()):
                            f.close()
                            if not quiet:
                                print(('skipping %s because cached file already Madrigal3' % (filename)))
                            return
                        else:
                            print(('overwriting cached file for %s because not Madrigal3' % (filename)))
                            f.close()
                            os.remove(hdf5Name)
                    except Exception:
                        traceback.print_exc()
                        try:
                            if f is not None:
                                f.close()
                        except Exception:
                            pass
                        print(('Problem with cached file for %s - removing' % (filename)))
                        os.remove(hdf5Name)
            if removeSummary:
                summaryFile = os.path.join(os.path.dirname(filename), 'overview',
                                           os.path.basename(filename) + '.summary')
                try:
                    if not quiet:
                        print(('removing summary %s' % (summaryFile)))
                    os.remove(summaryFile)
                except OSError:
                    # best effort - the summary file may simply not exist
                    pass
            # first try to download file from mad3Url if not skipMad3Download
            result = False
            if not skipMad3Download:
                result = downloadMad3File(filename, mad3Url, expDirNum)
                if result:
                    print(('Downloaded cached and summary file for %s from Mad3 CEDAR Madrigal site' % (filename)))
            if not result:
                # this Hdf5 file needs to be created locally
                print(('creating hdf5 file for %s' % (filename)))
                try:
                    madFileObj = madrigal.data.MadrigalFile(filename, madDB)
                    madFileObj.getCachedHdf5(iniFile, overwrite, showWarnings=True)
                except Exception:
                    traceback.print_exc()
    except Exception:
        print('Unexpected error')
        traceback.print_exc()
### main script begins here ###
if __name__ == '__main__':

    instList = None
    includeNonDefault = 0
    includeGeo = False
    iniFile = None
    overwrite = False
    mad3 = False
    expPath = None
    removeSummary = False
    skipMad3Download = False
    # default worker count; clamped below to at least 1 and at most this value
    numCPU = multiprocessing.cpu_count() - 2
    quiet = False
    mad3Url = 'http://cedar.openmadrigal.org'

    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["inst=", "path=", "includeNonDefault", "overwrite", "ini=",
                                                       "mad3", "includeGeo", "numCPU=", "help", "removeSummary",
                                                       "skipMad3Download", "quiet"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "--inst":
            instItems = a.split(',')
            instList = []
            for inst in instItems:
                try:
                    instList.append(int(inst))
                except ValueError:
                    print(('--inst must be a comma delimited list of kinst (integers), not %s' % (a)))
                    raise
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off trailing /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--mad3':
            mad3 = True
        elif o == '--ini':
            iniFile = a
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        elif o == '--removeSummary':
            removeSummary = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        elif o == '--quiet':
            quiet = True
        else:
            assert False, "unhandled option"

    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()
    fileList = madDB.getFileList(kinstList=instList, includeNonDefault=includeNonDefault,
                                 path=expPath)

    # kinsts of geophysical files, skipped unless --includeGeo given
    geoList = [120, 210, 211, 212]

    expDict = {}  # dict with keys = kinst, value = list of tuples of (exp sDT, eDT, expDirNum)
    # populated only if needed as set by skipMad3Download

    filesToProcess = []  # the list to pass into the multiprocessing module to handle

    numCPU = min(max(1, multiprocessing.cpu_count() - 2), numCPU)
    pool = multiprocessing.Pool(processes=numCPU)
    print(('Creating Cached Hdf5 files using %i cpu\'s' % (numCPU)))

    hdf5Exts = ('.h5', '.hdf5', '.hdf')
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    # search the remote server through the end of the current year instead of a
    # hard-coded 2020-12-31, which silently excluded newer experiments
    endYear = datetime.datetime.now().year

    print('This next step may take a few hours....')
    for thisFile in fileList:
        # check expPath
        if expPath:
            if thisFile.find(expPath) == -1:
                continue
        # skip Hdf5 files here so things are faster for an almost completely converted Madrigal site
        base, ext = os.path.splitext(thisFile)
        if ext in hdf5Exts:
            continue
        expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
        madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
        kinst = madExpObj.getKinstByPosition(0)
        if kinst in geoList and not includeGeo:
            continue
        if not skipMad3Download:
            if kinst not in expDict:
                # cache the remote experiment list for this instrument
                expList = madWebObj.getExperiments(kinst, 1950, 1, 1, 0, 0, 0,
                                                   endYear, 12, 31, 23, 59, 59)
                expList.sort()
                data = []
                for exp in expList:
                    sDT = datetime.datetime(exp.startyear, exp.startmonth, exp.startday,
                                            exp.starthour, exp.startmin, exp.startsec)
                    eDT = datetime.datetime(exp.endyear, exp.endmonth, exp.endday,
                                            exp.endhour, exp.endmin, exp.endsec)
                    url = exp.url
                    if url.find('experiments3') != -1:
                        expDirNum = '3'
                    elif url.find('experiments2') != -1:
                        expDirNum = '2'
                    else:
                        expDirNum = ''
                    data.append((sDT, eDT, expDirNum))
                expDict[kinst] = data
            sList = madExpObj.getExpStartDateTimeByPosition()[:6]
            sDT = datetime.datetime(*sList)
            eList = madExpObj.getExpEndDateTimeByPosition()[:6]
            eDT = datetime.datetime(*eList)
            # midpoint of the local experiment; the original sDT + (eDT - sDT)
            # evaluated to exactly eDT, which could miss the remote experiment
            # when the end times differ slightly
            mDT = sDT + (eDT - sDT) / 2
            # loop through the remote experiments to get the right experiment directory
            expDirNum = None
            for sDT, eDT, thisNum in expDict[kinst]:
                if sDT <= mDT and mDT <= eDT:
                    expDirNum = thisNum
                    break
        else:
            expDirNum = None
        filesToProcess.append((thisFile, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet))

    # to better balance the load, apply random shuffle
    random.shuffle(filesToProcess)

    if len(filesToProcess):
        pool.map(createHdfFile, filesToProcess, 5)
    # wait for all workers to finish before declaring success
    pool.close()
    pool.join()

    print('All HDF5 cached files successfully created')