#!PYTHONEXE
"""createCachedHdf5Files.py is a script that will walk all the files in a Madrigal database, and make sure all | ||||
files in the old Cedar 2.X format have cached hdf5 versions. Used only when updating from Madrigal 3 to | ||||
Madrigal 3. | ||||
$Id: createCachedHdf5Files.py 7119 2020-06-22 20:28:21Z brideout $ | ||||
""" | ||||
usage = """createCachedHdf5Files.py [--inst=<instList> --path=<expPath> --includeNonDefault --ini=<iniFile> --mad3 --overwrite | ||||
--includeGeo --numCPU=<numCPU> -h --help --removeSummary --skipMad3Download] | ||||
By default all instruments will be included. Use --inst=<comma delimited kinst list> to only include some instruments. | ||||
By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories. | ||||
By default only default files will be cached. Use --includeNonDefault to include all files. | ||||
By default, extra parameters and formats are added by the ini file $MADROOT/cachedFiles.ini. Use | ||||
--ini=<iniFile> to specify an alternative ini file. See madrigal.data.MadrigalFile._parseCachedIni for description of | ||||
the ini file format. Set --includeGeo to also convert geophysical files | ||||
Use --overwrite to overwrite all Hdf5 cached files | ||||
Use --mad3 to overwrite all non Madrigal3 Hdf5 files | ||||
numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be highter than default. | ||||
Use --removeSummary to remove summary files before creating Hdf5 files | ||||
Use --skipMad3Download to not try to dowload file from madrigal3.haystack.mit.edu | ||||
-h or --help - print usage and exit | ||||
""" | ||||
import os, os.path, sys
import getopt
import time, datetime
import traceback
import multiprocessing
import warnings
import random

import h5py

import madrigal.metadata
import madrigal.data
import madrigalWeb.madrigalWeb

def downloadMad3File(filename, mad3Url, expDirNum):
    """downloadMad3File downloads the appropriate Madrigal 3 Hdf5 file associated with filename, if possible,
    from mad3Url. Also downloads the summary file.

    Inputs:
        filename - full path to the file on the present Madrigal 2 server
        mad3Url - url of the Madrigal 3 CEDAR Madrigal server to get the hdf5 version from
        expDirNum - either '' or '3' - suggestion as to which experiment directory to try first

    Returns True on success, False if not.
    """
    user_fullname = 'Bill Rideout'
    user_email = 'brideout@haystack.mit.edu'
    user_affiliation = 'MIT'
    format = 'hdf5'
    madroot = '/opt/madrigal3'
    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)
    expDir = os.path.dirname(filename)
    basename = os.path.basename(filename)
    # take into account that the CEDAR Madrigal server has both experiments and experiments3
    remoteFileList = [os.path.join(madroot, filename[filename.find('experiments'):]) + '.hdf5']
    remoteFileList.append(remoteFileList[-1].replace('experiments/', 'experiments3/'))
    remoteSummaryFileList = [os.path.join(madroot, expDir[expDir.find('experiments'):], 'overview', basename + '.hdf5.summary')]
    remoteSummaryFileList.append(remoteSummaryFileList[-1].replace('experiments/', 'experiments3/'))
    if expDirNum == '3':
        # reverse both lists to try experiments3 first
        remoteFileList.reverse()
        remoteSummaryFileList.reverse()
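    # At this point remoteFileList holds the two candidate remote paths, e.g. (paths illustrative):
    #   ['/opt/madrigal3/experiments/2010/mlh/15jan10/mlh100115g.001.hdf5',
    #    '/opt/madrigal3/experiments3/2010/mlh/15jan10/mlh100115g.001.hdf5']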
    destDir = os.path.join(os.path.dirname(filename), 'overview')
    destination = os.path.join(destDir, os.path.basename(filename) + '.hdf5')
    summDest = destination + '.summary'
    for i in range(len(remoteFileList)):
        remoteFile = remoteFileList[i]
        remoteSummaryFile = remoteSummaryFileList[i]
        try:
            madWebObj.downloadFile(remoteFile, destination, user_fullname, user_email, user_affiliation,
                                   format)
            madWebObj.downloadFile(remoteSummaryFile, summDest, user_fullname, user_email, user_affiliation,
                                   format)
            return True
        except:
            if i < len(remoteFileList) - 1:
                continue
            else:
                print('Failed to download any of %s' % (str(remoteFileList)))
                return False

def createHdfFile(args):
    # this code must not raise an error - an uncaught exception here would kill the pool worker
    try:
        with warnings.catch_warnings():
            # we know we are calling deprecated code - suppress warnings
            warnings.simplefilter("ignore")
            filename, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet = args
            madDB = madrigal.metadata.MadrigalDB()

            # skip if already Hdf5
            fileName, fileExtension = os.path.splitext(filename)
            if fileExtension in ('.h5', '.hdf5', '.hdf'):
                if not quiet:
                    print('skipping %s because already Hdf5' % (filename))
                return

            hdf5Name = os.path.join(os.path.dirname(filename), 'overview', os.path.basename(filename) + '.hdf5')
            if os.access(hdf5Name, os.R_OK) and not overwrite:
                if not mad3:
                    return
                else:
                    # check if already Madrigal3
                    try:
                        f = h5py.File(hdf5Name, 'r')
                        if '_record_layout' in list(f['Metadata'].keys()):
                            f.close()
                            if not quiet:
                                print('skipping %s because cached file already Madrigal3' % (filename))
                            return
                        else:
                            print('overwriting cached file for %s because not Madrigal3' % (filename))
                            f.close()
                            os.remove(hdf5Name)
                    except:
                        traceback.print_exc()
                        try:
                            f.close()
                        except:
                            pass
                        print('Problem with cached file for %s - removing' % (filename))
                        os.remove(hdf5Name)

            if removeSummary:
                summaryFile = os.path.join(os.path.dirname(filename), 'overview',
                                           os.path.basename(filename) + '.summary')
                try:
                    if not quiet:
                        print('removing summary %s' % (summaryFile))
                    os.remove(summaryFile)
                except:
                    pass

            # first try to download the file from mad3Url if not skipMad3Download;
            # mad3Url is set at module scope in the __main__ block below and is
            # inherited by the forked pool workers
            result = False
            if not skipMad3Download:
                result = downloadMad3File(filename, mad3Url, expDirNum)
            if result:
                print('Downloaded cached and summary file for %s from Mad3 CEDAR Madrigal site' % (filename))
            if not result:
                # this Hdf5 file needs to be created locally
                print('creating hdf5 file for %s' % (filename))
                try:
                    madFileObj = madrigal.data.MadrigalFile(filename, madDB)
                    madFileObj.getCachedHdf5(iniFile, overwrite, showWarnings=True)
                except:
                    traceback.print_exc()

    except:
        print('Unexpected error')
        traceback.print_exc()

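# Each pool worker receives one args tuple, e.g. (values illustrative):
#   ('/opt/madrigal/experiments/2010/mlh/15jan10/mlh100115g.001',
#    None, False, False, False, False, '', False)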
### main script begins here ###
if __name__ == '__main__':

    instList = None
    includeNonDefault = 0
    includeGeo = False
    iniFile = None
    overwrite = False
    mad3 = False
    expPath = None
    removeSummary = False
    skipMad3Download = False
    # default is the maximum of (1, numCPUs available - 2), as documented in the usage string
    numCPU = max(1, multiprocessing.cpu_count() - 2)
    quiet = False
    mad3Url = 'http://cedar.openmadrigal.org'
    try:
        opts, args = getopt.getopt(sys.argv[1:], "h", ["inst=", "path=", "includeNonDefault", "overwrite", "ini=",
                                                       "mad3", "includeGeo", "numCPU=", "help", "removeSummary",
                                                       "skipMad3Download", "quiet"])
    except getopt.GetoptError as err:
        print(str(err))
        sys.exit(2)

    for o, a in opts:
        if o == "--inst":
            instItems = a.split(',')
            instList = []
            for inst in instItems:
                try:
                    instList.append(int(inst))
                except:
                    print('--inst must be a comma delimited list of kinst (integers), not %s' % (a))
                    raise
        elif o == "--path":
            expPath = a
            if len(expPath) > 1 and expPath[-1] == '/':
                # strip off trailing /
                expPath = expPath[:-1]
            if not os.access(expPath, os.R_OK):
                raise IOError('Unable to access path %s' % (expPath))
        elif o in ("-h", "--help"):
            print(usage)
            sys.exit(-1)
        elif o == '--includeNonDefault':
            includeNonDefault = 1
        elif o == '--includeGeo':
            includeGeo = True
        elif o == '--overwrite':
            overwrite = True
        elif o == '--mad3':
            mad3 = True
        elif o == '--ini':
            iniFile = a
        elif o == '--numCPU':
            numCPU = int(a)
            if numCPU < 1:
                raise ValueError('numCPU must be positive, not %i' % (numCPU))
        elif o == '--removeSummary':
            removeSummary = True
        elif o == '--skipMad3Download':
            skipMad3Download = True
        elif o == '--quiet':
            quiet = True
        else:
            assert False, "unhandled option"
    # get a list of all files to test for caching
    madDB = madrigal.metadata.MadrigalDB()
    fileList = madDB.getFileList(kinstList=instList, includeNonDefault=includeNonDefault,
                                 path=expPath)

    # possibly skip geophysical files
    geoList = [120, 210, 211, 212]

    expDict = {}  # dict with key = kinst, value = list of tuples of (exp sDT, exp eDT, expDirNum);
                  # populated only if needed, as set by skipMad3Download

    filesToProcess = []  # the list to pass into the multiprocessing pool

    # cap numCPU at the default even if --numCPU asked for more
    numCPU = min(max(1, multiprocessing.cpu_count() - 2), numCPU)
    pool = multiprocessing.Pool(processes=numCPU)
    print("Creating Cached Hdf5 files using %i cpu's" % (numCPU))

    hdf5Exts = ('.h5', '.hdf5', '.hdf')

    madWebObj = madrigalWeb.madrigalWeb.MadrigalData(mad3Url)

    print('This next step may take a few hours....')
    for thisFile in fileList:
        # check expPath
        if expPath:
            if thisFile.find(expPath) == -1:
                continue

        # skip Hdf5 files here so things are faster for an almost completely converted Madrigal site
        base, ext = os.path.splitext(thisFile)
        if ext in hdf5Exts:
            continue

        expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
        madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
        kinst = madExpObj.getKinstByPosition(0)
        if kinst in geoList and not includeGeo:
            continue

        # experiment directory hint for downloadMad3File; stays None if no matching
        # remote experiment is found or if --skipMad3Download was given
        expDirNum = None
        if not skipMad3Download:
            if kinst not in list(expDict.keys()):
                # query the remote Madrigal 3 server once per instrument for all its experiments
                expList = madWebObj.getExperiments(kinst, 1950, 1, 1, 0, 0, 0, 2020, 12, 31, 23, 59, 59)
                expList.sort()
                data = []
                for exp in expList:
                    sDT = datetime.datetime(exp.startyear, exp.startmonth, exp.startday,
                                            exp.starthour, exp.startmin, exp.startsec)
                    eDT = datetime.datetime(exp.endyear, exp.endmonth, exp.endday,
                                            exp.endhour, exp.endmin, exp.endsec)
                    url = exp.url
                    if url.find('experiments3') != -1:
                        expDirNum = '3'
                    elif url.find('experiments2') != -1:
                        expDirNum = '2'
                    else:
                        expDirNum = ''
                    data.append((sDT, eDT, expDirNum))
                expDict[kinst] = data
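                # e.g. (values illustrative):
                #   expDict[30] = [(datetime(2010, 1, 15, 0, 0, 0), datetime(2010, 1, 16, 0, 0, 0), '3'), ...]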
            sList = madExpObj.getExpStartDateTimeByPosition()[:6]
            sDT = datetime.datetime(*sList)
            eList = madExpObj.getExpEndDateTimeByPosition()[:6]
            eDT = datetime.datetime(*eList)
            # use the midpoint of the local experiment when matching against the remote experiment list
            mDT = sDT + (eDT - sDT) / 2
            # loop through the experiments to get the right experiment directory
            for sDT, eDT, thisNum in expDict[kinst]:
                if sDT <= mDT <= eDT:
                    expDirNum = thisNum
                    break

        filesToProcess.append((thisFile, iniFile, overwrite, mad3, removeSummary, skipMad3Download, expDirNum, quiet))

    # to better balance the load across workers, apply a random shuffle
    random.shuffle(filesToProcess)

    if len(filesToProcess):
        # chunksize 5 - hand each worker a few files at a time
        pool.map(createHdfFile, filesToProcess, 5)

    print('All HDF5 cached files successfully created')