DATABASES/madrigal Files · source/madpy/scripts/bin/createCachedFiles.py

Add BASE_URL in settings to work with proxies

jespinoza - - Load All Authors

File last commit:

r0:b84e1135c2c4


                r18:5a8055e18e7b

Download file

             createCachedFiles.py
        
                    189 lines
            
             | 7.3 KiB
            
                | text/x-python
            
             |
                PythonLexer
            
             / source / madpy / scripts / bin / createCachedFiles.py
          
                    History
                
                 |
                  Annotation
                 | Raw
                 |Copy content
                 |Copy permalink

      #!PYTHONEXE

      """createCachedFiles.py is a script that will create cached text or netCDF4 files to speed up

      user downloads.

      $Id: createCachedFiles.py 7119 2020-06-22 20:28:21Z brideout $

      """

      # Command line help text; printed when -h or --help is given.
      usage = """createCachedFiles.py [--excludeText --excludeNetCDF4 --inst=<instList> --kindat=<kindatList>  --path=<expPath>
      --includeNonDefault  --overwrite --includeGeo --listOnly --numCPU=<numCPU> -h --help]
      By default, both text and netCDF4 files are created.  Use --excludeText or --excludeNetCDF4 to only create one type.
      By default all instruments will be included.  Use --inst=<comma delimited kinst list> to only include some instruments.
      By default all kinds of data will be included.  Use --kindat=<comma delimited kindat list> to only include some kindats.
      By default, all experiment directories will be included.  Use --path to limit to a particular directory and all subdirectories.
      By default only default files will be cached.  Use --includeNonDefault to include all files.
      Use --overwrite to overwrite all existing cached files. Default is to skip existing cached files.
      Set --includeGeo to also convert geophysical files. Default is to skip them.
      Set --listOnly to simply print cached files to be created
      numCPU by default is the maximum of (1, numCPUs available - 2).  Use --numCPU to override, but still will not be higher than default.
      -h or --help - print usage and exit
      """

      # standard python imports

      import os, os.path, sys

      import getopt

      import time

      import traceback

      import multiprocessing

      import subprocess

      # madrigal imports

      import madrigal.metadata

      import madrigal.cedar

      def createCachedFiles(args):
          """Create (or just list) the cached text and netCDF4 versions of one file.

          This function is mapped over many files with multiprocessing.Pool.map by
          the main script, so ordinary errors are reported and swallowed rather
          than raised - one bad file does not abort the rest of the batch.

          args - a tuple (filename, excludeText, excludeNetCDF4, overwrite, listOnly):
              filename - full path to the Madrigal file to be cached
              excludeText - if True, do not create the gzipped text cache
              excludeNetCDF4 - if True, do not create the netCDF4 cache
              overwrite - if True, recreate cached files even when they exist
              listOnly - if True, only print which cached files are needed

          Cached files live in the "overview" subdirectory beside filename, named
          <basename>.txt.gz and <basename>.nc.

          Returns None.  Any exception derived from Exception is printed with its
          traceback and suppressed; KeyboardInterrupt and SystemExit propagate so
          that a worker can still be interrupted.
          """
          try:
              filename, excludeText, excludeNetCDF4, overwrite, listOnly = args
              if not listOnly:
                  print('working on %s' % (filename))
              # push the progress message out immediately
              sys.stdout.flush()

              basename = os.path.basename(filename)
              overviewDir = os.path.join(os.path.dirname(filename), 'overview')

              if not excludeText:
                  cachedFile = os.path.join(overviewDir, basename + '.txt')
                  # the text cache is stored gzipped, so test for the .gz file
                  if overwrite or not os.access(cachedFile + '.gz', os.R_OK):
                      if listOnly:
                          print('ascii cache needed for %s' % (filename))
                      else:
                          madrigal.cedar.convertToText(filename, cachedFile)
                          # -f replaces any existing .gz (the overwrite case)
                          subprocess.check_call(['gzip', '-f', cachedFile])

              if not excludeNetCDF4:
                  cachedFile = os.path.join(overviewDir, basename + '.nc')
                  if overwrite or not os.access(cachedFile, os.R_OK):
                      if listOnly:
                          print('netCDF4 cache needed for %s' % (filename))
                      else:
                          # remove the old cache first when overwriting
                          if os.access(cachedFile, os.R_OK):
                              os.remove(cachedFile)
                          try:
                              madrigal.cedar.convertToNetCDF4(filename, cachedFile)
                          except IOError:
                              # direct conversion failed; fall back to loading the
                              # file as a MadrigalCedarFile and writing it out
                              cedarObj = madrigal.cedar.MadrigalCedarFile(filename)
                              cedarObj.write('netCDF4', cachedFile)

          except Exception:
              print('Unexpected error')
              traceback.print_exc()

      def resolveNumCPU(requested, available):
          """Return the number of worker processes the script should start.

          requested - the value given with --numCPU, or None if it was not given
          available - the number of cpus on this host

          The default is max(1, available - 2).  A requested value can lower
          that number but never raise it, and the result is never less than 1,
          so it is always a valid size for multiprocessing.Pool.
          """
          maxCPU = max(1, available - 2)
          if requested is None:
              return maxCPU
          return max(1, min(maxCPU, requested))


      ### main script begins here ###
      if __name__ == '__main__':

          # defaults, possibly overridden by command line options below
          excludeText = False
          excludeNetCDF4 = False
          instList = None
          kindatList = None
          includeNonDefault = 0
          includeGeo = False
          overwrite = False
          expPath = None
          listOnly = False
          numCPU = None  # None means --numCPU not given; resolved further down

          try:
              opts, args = getopt.getopt(sys.argv[1:], "h", ["excludeText", "excludeNetCDF4", "inst=", "kindat=", "path=",
                                                             "includeNonDefault", "overwrite", "includeGeo", "numCPU=",
                                                             "listOnly", "help"])
          except getopt.GetoptError as err:
              print(str(err))
              sys.exit(2)

          for o, a in opts:
              if o == '--excludeText':
                  excludeText = True
              elif o == '--excludeNetCDF4':
                  excludeNetCDF4 = True
              elif o == "--inst":
                  instList = []
                  for inst in a.split(','):
                      try:
                          instList.append(int(inst))
                      except ValueError:
                          print(('--inst must be a comma delimited list of kinst (integers), not %s' % (a)))
                          raise
              elif o == "--kindat":
                  kindatList = []
                  for kindat in a.split(','):
                      try:
                          kindatList.append(int(kindat))
                      except ValueError:
                          print(('--kindat must be a comma delimited list of kindat codes (integers), not %s' % (a)))
                          raise
              elif o == "--path":
                  expPath = a
                  if len(expPath) > 1 and expPath[-1] == '/':
                      # strip off /
                      expPath = expPath[:-1]
                  if not os.access(expPath, os.R_OK):
                      raise IOError('Unable to access path %s' % (expPath))
              elif o in ("-h", "--help"):
                  print(usage)
                  sys.exit(-1)
              elif o == '--includeNonDefault':
                  includeNonDefault = 1
              elif o == '--includeGeo':
                  includeGeo = True
              elif o == '--overwrite':
                  overwrite = True
              elif o == '--listOnly':
                  listOnly = True
              elif o == '--numCPU':
                  numCPU = int(a)
                  if numCPU < 1:
                      raise ValueError('numCPU must be positive, not %i' % (numCPU))
              else:
                  # getopt only returns options from the lists above, so this is
                  # a programming error; raise so it is not stripped under -O
                  raise ValueError('unhandled option %s' % (o))

          if excludeText and excludeNetCDF4:
              print('Nothing to be done since both text and netCDF4 cached files excluded')
              sys.exit(0)

          # get a list of all files to test for caching
          madDB = madrigal.metadata.MadrigalDB()
          fileList = madDB.getFileList(kinstList=instList, kindatList=kindatList, includeNonDefault=includeNonDefault,
                                       path=expPath)

          # kinst codes skipped unless --includeGeo is set (geophysical files)
          geoList = [120, 210, 211, 212]

          # On hosts with two or fewer cpus, cpu_count()-2 is zero or negative;
          # resolveNumCPU guarantees at least one worker so Pool does not fail.
          numCPU = resolveNumCPU(numCPU, multiprocessing.cpu_count())
          print(('Creating cached files using %i cpu\'s' % (numCPU)))
          print('This next step may take a few hours....')

          filesToProcess = [] # the list to pass into the multiprocessing module to handle

          # the with statement shuts the worker processes down when done
          with multiprocessing.Pool(processes=numCPU) as pool:
              for thisFile in fileList:
                  # check expPath
                  if expPath and expPath not in thisFile:
                      continue

                  # the instrument code is read from the experiment's expTab.txt
                  expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
                  madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
                  kinst = madExpObj.getKinstByPosition(0)
                  if kinst in geoList and not includeGeo:
                      continue

                  filesToProcess.append((thisFile, excludeText, excludeNetCDF4, overwrite, listOnly))

                  # hand the queued files to the pool once more than 200 are waiting
                  if len(filesToProcess) > 200:
                      pool.map(createCachedFiles, filesToProcess)
                      filesToProcess = []

              # get all remaining
              if filesToProcess:
                  pool.map(createCachedFiles, filesToProcess)

          print('All cached files successfully created')

	Site-wide shortcuts
/	Use quick search box
g h	Goto home page
g g	Goto my private gists page
g G	Goto my public gists page
g 0-9	Goto bookmarked items from 0-9
n r	New repository page
n g	New gist page

	Repositories
g s	Goto summary page
g c	Goto changelog page
g f	Goto files page
g F	Goto files page with file search activated
g p	Goto pull requests page
g o	Goto repository settings
g O	Goto repository access permissions settings
t s	Toggle sidebar on some pages

				#!PYTHONEXE

				"""createCachedFiles.py is a script that will create cached text or netCDF4 files to speed up
				user downloads.

				$Id: createCachedFiles.py 7119 2020-06-22 20:28:21Z brideout $
				"""

				# Command line help text; printed when -h or --help is given.
				usage = """createCachedFiles.py [--excludeText --excludeNetCDF4 --inst=<instList> --kindat=<kindatList> --path=<expPath>
				--includeNonDefault --overwrite --includeGeo --listOnly --numCPU=<numCPU> -h --help]
				By default, both text and netCDF4 files are created. Use --excludeText or --excludeNetCDF4 to only create one type.
				By default all instruments will be included. Use --inst=<comma delimited kinst list> to only include some instruments.
				By default all kinds of data will be included. Use --kindat=<comma delimited kindat list> to only include some kindats.
				By default, all experiment directories will be included. Use --path to limit to a particular directory and all subdirectories.
				By default only default files will be cached. Use --includeNonDefault to include all files.
				Use --overwrite to overwrite all existing cached files. Default is to skip existing cached files.
				Set --includeGeo to also convert geophysical files. Default is to skip them.
				Set --listOnly to simply print cached files to be created
				numCPU by default is the maximum of (1, numCPUs available - 2). Use --numCPU to override, but still will not be higher than default.
				-h or --help - print usage and exit
				"""

				# standard python imports
				import os, os.path, sys
				import getopt
				import time
				import traceback
				import multiprocessing
				import subprocess

				# madrigal imports
				import madrigal.metadata
				import madrigal.cedar



				def createCachedFiles(args):
				    """Create (or just list) the cached text and netCDF4 versions of one file.

				    This function is mapped over many files with multiprocessing.Pool.map by
				    the main script, so ordinary errors are reported and swallowed rather
				    than raised - one bad file does not abort the rest of the batch.

				    args - a tuple (filename, excludeText, excludeNetCDF4, overwrite, listOnly):
				        filename - full path to the Madrigal file to be cached
				        excludeText - if True, do not create the gzipped text cache
				        excludeNetCDF4 - if True, do not create the netCDF4 cache
				        overwrite - if True, recreate cached files even when they exist
				        listOnly - if True, only print which cached files are needed

				    Cached files live in the "overview" subdirectory beside filename, named
				    <basename>.txt.gz and <basename>.nc.

				    Returns None.  Any exception derived from Exception is printed with its
				    traceback and suppressed; KeyboardInterrupt and SystemExit propagate so
				    that a worker can still be interrupted.
				    """
				    try:
				        filename, excludeText, excludeNetCDF4, overwrite, listOnly = args
				        if not listOnly:
				            print('working on %s' % (filename))
				        # push the progress message out immediately
				        sys.stdout.flush()

				        basename = os.path.basename(filename)
				        overviewDir = os.path.join(os.path.dirname(filename), 'overview')

				        if not excludeText:
				            cachedFile = os.path.join(overviewDir, basename + '.txt')
				            # the text cache is stored gzipped, so test for the .gz file
				            if overwrite or not os.access(cachedFile + '.gz', os.R_OK):
				                if listOnly:
				                    print('ascii cache needed for %s' % (filename))
				                else:
				                    madrigal.cedar.convertToText(filename, cachedFile)
				                    # -f replaces any existing .gz (the overwrite case)
				                    subprocess.check_call(['gzip', '-f', cachedFile])

				        if not excludeNetCDF4:
				            cachedFile = os.path.join(overviewDir, basename + '.nc')
				            if overwrite or not os.access(cachedFile, os.R_OK):
				                if listOnly:
				                    print('netCDF4 cache needed for %s' % (filename))
				                else:
				                    # remove the old cache first when overwriting
				                    if os.access(cachedFile, os.R_OK):
				                        os.remove(cachedFile)
				                    try:
				                        madrigal.cedar.convertToNetCDF4(filename, cachedFile)
				                    except IOError:
				                        # direct conversion failed; fall back to loading the
				                        # file as a MadrigalCedarFile and writing it out
				                        cedarObj = madrigal.cedar.MadrigalCedarFile(filename)
				                        cedarObj.write('netCDF4', cachedFile)

				    except Exception:
				        print('Unexpected error')
				        traceback.print_exc()


				def resolveNumCPU(requested, available):
				    """Return the number of worker processes the script should start.

				    requested - the value given with --numCPU, or None if it was not given
				    available - the number of cpus on this host

				    The default is max(1, available - 2).  A requested value can lower
				    that number but never raise it, and the result is never less than 1,
				    so it is always a valid size for multiprocessing.Pool.
				    """
				    maxCPU = max(1, available - 2)
				    if requested is None:
				        return maxCPU
				    return max(1, min(maxCPU, requested))


				### main script begins here ###
				if __name__ == '__main__':

				    # defaults, possibly overridden by command line options below
				    excludeText = False
				    excludeNetCDF4 = False
				    instList = None
				    kindatList = None
				    includeNonDefault = 0
				    includeGeo = False
				    overwrite = False
				    expPath = None
				    listOnly = False
				    numCPU = None  # None means --numCPU not given; resolved further down

				    try:
				        opts, args = getopt.getopt(sys.argv[1:], "h", ["excludeText", "excludeNetCDF4", "inst=", "kindat=", "path=",
				                                                       "includeNonDefault", "overwrite", "includeGeo", "numCPU=",
				                                                       "listOnly", "help"])
				    except getopt.GetoptError as err:
				        print(str(err))
				        sys.exit(2)

				    for o, a in opts:
				        if o == '--excludeText':
				            excludeText = True
				        elif o == '--excludeNetCDF4':
				            excludeNetCDF4 = True
				        elif o == "--inst":
				            instList = []
				            for inst in a.split(','):
				                try:
				                    instList.append(int(inst))
				                except ValueError:
				                    print(('--inst must be a comma delimited list of kinst (integers), not %s' % (a)))
				                    raise
				        elif o == "--kindat":
				            kindatList = []
				            for kindat in a.split(','):
				                try:
				                    kindatList.append(int(kindat))
				                except ValueError:
				                    print(('--kindat must be a comma delimited list of kindat codes (integers), not %s' % (a)))
				                    raise
				        elif o == "--path":
				            expPath = a
				            if len(expPath) > 1 and expPath[-1] == '/':
				                # strip off /
				                expPath = expPath[:-1]
				            if not os.access(expPath, os.R_OK):
				                raise IOError('Unable to access path %s' % (expPath))
				        elif o in ("-h", "--help"):
				            print(usage)
				            sys.exit(-1)
				        elif o == '--includeNonDefault':
				            includeNonDefault = 1
				        elif o == '--includeGeo':
				            includeGeo = True
				        elif o == '--overwrite':
				            overwrite = True
				        elif o == '--listOnly':
				            listOnly = True
				        elif o == '--numCPU':
				            numCPU = int(a)
				            if numCPU < 1:
				                raise ValueError('numCPU must be positive, not %i' % (numCPU))
				        else:
				            # getopt only returns options from the lists above, so this is
				            # a programming error; raise so it is not stripped under -O
				            raise ValueError('unhandled option %s' % (o))

				    if excludeText and excludeNetCDF4:
				        print('Nothing to be done since both text and netCDF4 cached files excluded')
				        sys.exit(0)

				    # get a list of all files to test for caching
				    madDB = madrigal.metadata.MadrigalDB()
				    fileList = madDB.getFileList(kinstList=instList, kindatList=kindatList, includeNonDefault=includeNonDefault,
				                                 path=expPath)

				    # kinst codes skipped unless --includeGeo is set (geophysical files)
				    geoList = [120, 210, 211, 212]

				    # On hosts with two or fewer cpus, cpu_count()-2 is zero or negative;
				    # resolveNumCPU guarantees at least one worker so Pool does not fail.
				    numCPU = resolveNumCPU(numCPU, multiprocessing.cpu_count())
				    print(('Creating cached files using %i cpu\'s' % (numCPU)))
				    print('This next step may take a few hours....')

				    filesToProcess = [] # the list to pass into the multiprocessing module to handle

				    # the with statement shuts the worker processes down when done
				    with multiprocessing.Pool(processes=numCPU) as pool:
				        for thisFile in fileList:
				            # check expPath
				            if expPath and expPath not in thisFile:
				                continue

				            # the instrument code is read from the experiment's expTab.txt
				            expTab = os.path.join(os.path.dirname(thisFile), 'expTab.txt')
				            madExpObj = madrigal.metadata.MadrigalExperiment(madDB, expTab)
				            kinst = madExpObj.getKinstByPosition(0)
				            if kinst in geoList and not includeGeo:
				                continue

				            filesToProcess.append((thisFile, excludeText, excludeNetCDF4, overwrite, listOnly))

				            # hand the queued files to the pool once more than 200 are waiting
				            if len(filesToProcess) > 200:
				                pool.map(createCachedFiles, filesToProcess)
				                filesToProcess = []

				        # get all remaining
				        if filesToProcess:
				            pool.map(createCachedFiles, filesToProcess)

				    print('All cached files successfully created')