##// END OF EJS Templates

File last commit:

r608:3cc55c179318
r609:d72f625d344c
Show More
jroIO_HDF5.py
1005 lines | 30.0 KiB | text/x-python | PythonLexer
import numpy
import time
import os
import h5py
import re
import tables
from schainpy.model.data.jrodata import *
from schainpy.model.proc.jroproc_base import ProcessingUnit, Operation
from schainpy.model.io.jroIO_base import *
class HDF5Reader(ProcessingUnit):
    """
    Processing unit that reads HDF5 data files block by block into ``self.dataOut``.

    Every data file is expected to hold a 'Data' group (with a 'time' entry, an
    'nRecords' attribute and one entry per stored array) whose 'metadata'
    attribute names a second HDF5 file containing a 'Metadata' group.
    ``run`` configures the reader on its first call (see ``setup``) and then
    delivers one block per call through ``getData``.
    """

    ext = ".hdf5"
    optchar = "D"
    timezone = None
    secStart = None
    secEnd = None
    fileIndex = None
    blockIndex = None
    blocksPerFile = None
    path = None

    # List of files
    filenameList = None
    datetimeList = None

    # HDF5 file
    fpMetadata = None
    pathMeta = None
    listMetaname = None
    listMeta = None
    listDataname = None
    listData = None
    listShapes = None
    fp = None

    # dataOut reconstruction
    dataOut = None
    nRecords = None

    # Seconds subtracted from the stored times when timezone == 'lt'.
    _ltOffset = 5 * 3600

    def __init__(self):
        """Initialise the base unit and create the default output object."""
        # HDF5Writer initialises its base class and isConfig the same way;
        # run() below reads self.isConfig.
        # NOTE(review): assumes ProcessingUnit.__init__ takes no arguments,
        # like Operation.__init__ as used in this file -- confirm.
        ProcessingUnit.__init__(self)
        self.dataOut = self.__createObjByDefault()
        self.isConfig = False
        return

    def __createObjByDefault(self):
        """Return a new, empty ``Parameters`` object."""
        return Parameters()

    def setup(self, path=None,
              startDate=None,
              endDate=None,
              startTime=datetime.time(0, 0, 0),
              endTime=datetime.time(23, 59, 59),
              walk=True,
              timezone='ut',
              all=0,
              online=False,
              ext=None):
        """
        Select the files to read and open the first one.

        Inputs:
            path      : folder to read; several folders may be given separated by ','
            startDate : first day to read (combined with startTime)
            endDate   : last day to read (combined with endTime)
            startTime : datetime.time, start of the selected range
            endTime   : datetime.time, end of the selected range
            walk      : if True, look for day folders inside every path;
                        otherwise the paths themselves are searched for files
            timezone  : 'lt' subtracts a fixed 5-hour offset from the stored
                        times, any other value leaves them untouched
            all       : accepted but not used
            online    : if True, the online search is requested (see note below)
            ext       : file extension; defaults to the class attribute ``ext``

        The process exits (``sys.exit(-1)``) when no file is selected.
        """
        if ext is None:
            ext = self.ext

        self.timezone = timezone
        self.path = path

        # Selected range expressed as seconds since 1970-01-01.
        epoch = datetime.datetime(1970, 1, 1)
        secStart = (datetime.datetime.combine(startDate, startTime) - epoch).total_seconds()
        secEnd = (datetime.datetime.combine(endDate, endTime) - epoch).total_seconds()
        self.secStart = secStart
        self.secEnd = secEnd

        if not online:
            # Offline file search
            self.__searchFilesOffline(path, startDate, endDate, ext,
                                      startTime, endTime, secStart, secEnd, walk)
        else:
            # NOTE(review): no __searchFilesOnline method is defined in this
            # class, so online=True currently ends in an AttributeError.
            self.__searchFilesOnline(path, walk)

        if not self.filenameList:
            print("There is no files into the folder: %s" % (path))
            sys.exit(-1)

        self.fileIndex = -1
        self.__setNextFileOffline()
        # Returns immediately when the metadata file is unchanged.
        self.__readMetadata()
        self.blockIndex = 0
        return

    def __searchFilesOffline(self,
                             path,
                             startDate,
                             endDate,
                             ext,
                             startTime=datetime.time(0, 0, 0),
                             endTime=datetime.time(23, 59, 59),
                             secStart=0,
                             secEnd=numpy.inf,
                             walk=True):
        """
        Build ``self.filenameList`` / ``self.datetimeList`` for the selected range.

        With ``walk`` the day folders (as recognised by ``isDoyFolder``) of every
        path in the comma-separated ``path`` are matched against each date
        between startDate and endDate; without it the paths are used directly.
        Files ending in ``ext`` are then kept when ``__isFileinThisTime``
        accepts them for [secStart, secEnd).

        Return:
            (pathList, filenameList), or (None, None) when nothing was found.
        """
        multi_path = path.split(',')
        pathList = []

        if not walk:
            pathList.extend(multi_path)
        else:
            for single_path in multi_path:
                dirList = [thisPath for thisPath in os.listdir(single_path)
                           if os.path.isdir(os.path.join(single_path, thisPath))
                           and isDoyFolder(thisPath)]

                # A path without day folders must not abort the search of the
                # remaining paths.
                if not dirList:
                    continue

                thisDate = startDate
                while thisDate <= endDate:
                    timeTuple = thisDate.timetuple()
                    pattern = '?' + '%4.4d%3.3d' % (timeTuple.tm_year, timeTuple.tm_yday) + '*'
                    for match in fnmatch.filter(dirList, pattern):
                        pathList.append(os.path.join(single_path, match))
                    thisDate += datetime.timedelta(1)

        if not pathList:
            print("No folder was found for the date range: %s-%s" % (startDate, endDate))
            return None, None

        print("%d folder(s) was(were) found for the date range: %s - %s" % (len(pathList), startDate, endDate))

        # Order the folders by the name of their first file.  The folder index
        # is part of the sort key so that two folders whose first files share
        # a name are both kept; folders without any matching file are skipped.
        folders = []
        for i, thisPath in enumerate(pathList):
            fileList = glob.glob1(thisPath, "*%s" % ext)
            fileList.sort()
            if not fileList:
                continue
            folders.append((fileList[0], i, thisPath, fileList))
        folders.sort()

        filenameList = []
        datetimeList = []
        for _firstFile, _index, thisPath, fileList in folders:
            for thisFile in fileList:
                filename = os.path.join(thisPath, thisFile)
                thisDatetime = self.__isFileinThisTime(filename, secStart, secEnd)
                if not thisDatetime:
                    continue
                filenameList.append(filename)
                datetimeList.append(thisDatetime)

        if not filenameList:
            print("No file was found for the time range %s - %s" % (startTime, endTime))
            return None, None

        print("%d file(s) was(were) found for the time range: %s - %s" % (len(filenameList), startTime, endTime))
        print("")

        for filename, thisDatetime in zip(filenameList, datetimeList):
            print("%s -> [%s]" % (filename, thisDatetime.ctime()))

        self.filenameList = filenameList
        self.datetimeList = datetimeList

        return pathList, filenameList

    def __isFileinThisTime(self, filename, startSeconds, endSeconds):
        """
        Tell whether a data file has at least one record inside the selected range.

        Inputs:
            filename     : full name of the HDF5 data file
            startSeconds : start of the range, in seconds since 1970-01-01
            endSeconds   : end of the range (exclusive), same units

        Return:
            datetime.datetime built from the first time stored in the file when
            any stored time falls in [startSeconds, endSeconds); None otherwise.

        Exceptions:
            IOError if the file cannot be opened.
        """
        try:
            fp = h5py.File(filename, 'r')
        except IOError:
            traceback.print_exc()
            raise IOError("The file %s can't be opened" % (filename))

        try:
            # First row of the 'time' entry is taken as the time vector.
            time0 = fp['Data']['time'][:][0].astype(float)
        finally:
            fp.close()

        if self.timezone == 'lt':
            time0 -= self._ltOffset

        boolTimer = numpy.logical_and(time0 >= startSeconds, time0 < endSeconds)
        if not numpy.any(boolTimer):
            return None

        return datetime.datetime.utcfromtimestamp(time0[0])

    def __checkPath(self):
        """Set ``self.status`` to 1 if ``self.path`` exists, else to 0 and report it."""
        if os.path.exists(self.path):
            self.status = 1
        else:
            self.status = 0
            print('Path:%s does not exists' % self.path)
        return

    def __setNextFileOffline(self):
        """
        Open the next file of ``self.filenameList`` and prepare it for reading.

        Return:
            1 when a file was opened, 0 when the list is exhausted.
        """
        idFile = self.fileIndex + 1

        if not (idFile < len(self.filenameList)):
            print("No more Files")
            return 0

        filename = self.filenameList[idFile]
        filePointer = h5py.File(filename, 'r')

        # Release the previous data file; its content is already in memory.
        if self.fp is not None:
            self.fp.close()

        self.flagIsNewFile = 1
        self.fileIndex = idFile
        self.filename = filename
        self.fp = filePointer

        # Force getData() to load the arrays of the new file.
        self.listData = None
        self.listDataname = None

        print("Setting the file: %s" % self.filename)

        self.__readMetadata()
        self.__setBlockList()
        self.nRecords = self.fp['Data'].attrs['nRecords']
        self.blockIndex = 0
        return 1

    def __setBlockList(self):
        """
        Select the records of the current file that fall in the requested range.

        Reads  : self.fp, self.secStart, self.secEnd, self.timezone
        Writes : self.blockList (indices of the selected records),
                 self.blocksPerFile (how many were selected)
        """
        timeVector = self.fp['Data']['time'][()].astype(float)[0]

        if self.timezone == 'lt':
            timeVector -= self._ltOffset

        inRange = numpy.logical_and(timeVector >= self.secStart, timeVector < self.secEnd)
        ind = numpy.where(inRange)[0]

        self.blockList = ind
        self.blocksPerFile = len(ind)
        return

    def __readMetadata(self):
        """
        Load the metadata file named by the 'metadata' attribute of the data file.

        Writes : self.pathMeta, self.listShapes, self.listMetaname, self.listMeta

        Nothing is re-read when the metadata file is the one already loaded.
        """
        metaName = self.fp['Data'].attrs['metadata']

        pathMeta = os.path.join(self.path, metaName)
        if not os.path.isfile(pathMeta):
            # HDF5Writer in this file stores the metadata file in the same day
            # folder as the data files, so look next to the current data file.
            pathMeta = os.path.join(os.path.dirname(self.filename), metaName)

        if pathMeta == self.pathMeta:
            return

        listShapes = {}
        listMetaname = []
        listMetadata = []

        filePointer = h5py.File(pathMeta, 'r')
        try:
            groupPointer = filePointer['Metadata']
            for name in groupPointer.keys():
                if name == 'array dimensions':
                    # NOTE(review): fields 1..4 of every row are later read as
                    # (nChannels, nPoints, nSamples, mode) by __setDataArray,
                    # while HDF5Writer.dtype in this file declares
                    # (nDimensions, dim2, dim1, dim0, mode) -- confirm which
                    # layout the files to be read actually use.
                    for shapes in groupPointer[name][:]:
                        listShapes[shapes[0]] = numpy.array([shapes[1], shapes[2], shapes[3], shapes[4]])
                else:
                    data = groupPointer[name][()]
                    listMetaname.append(name)
                    listMetadata.append(data)
                    if name == 'type':
                        self.__initDataOut(data)
        finally:
            filePointer.close()

        # Remember the file only once it has been read successfully.
        self.pathMeta = pathMeta
        self.listShapes = listShapes
        self.listMetaname = listMetaname
        self.listMeta = listMetadata
        return

    def __readData(self):
        """
        Load every entry of the 'Data' group of the current file into memory.

        The 'time' entry is stored under the name 'utctime'; every other entry
        is assembled by ``__setDataArray``.

        Writes : self.listDataname, self.listData
        """
        grp = self.fp['Data']
        listdataname = []
        listdata = []

        for name in grp.keys():
            if name == 'time':
                listdataname.append('utctime')
                listdata.append(grp[name][()].astype(float)[0])
                continue

            listdataname.append(name)
            listdata.append(self.__setDataArray(self.nRecords, grp[name], self.listShapes[name]))

        self.listDataname = listdataname
        self.listData = listdata
        return

    def __setDataArray(self, nRecords, dataset, shapes):
        """
        Assemble one array from the per-channel or per-parameter datasets of a group.

        Inputs:
            nRecords : number of records stored in the file
            dataset  : group holding 'channel<i>' (mode 0) or 'param<i>' entries
            shapes   : (nChannels, nPoints, nSamples, mode)

        Return:
            array of shape (nRecords, nChannels, nPoints, nSamples); the points
            axis is removed when mode is 0 and nPoints is 1, the channels axis
            when mode is not 0 and nChannels is 1.
        """
        # Plain ints keep numpy.zeros/range happy whatever the stored types are.
        nRecords = int(nRecords)
        nChannels, nPoints, nSamples, mode = [int(value) for value in shapes[:4]]

        arrayData = numpy.zeros((nRecords, nChannels, nPoints, nSamples))
        axisSqueeze = None

        if mode == 0:
            strds = 'channel'
            nDatas = nChannels
            newShapes = (nRecords, nPoints, nSamples)
            if nPoints == 1:
                axisSqueeze = 2
        else:
            strds = 'param'
            nDatas = nPoints
            newShapes = (nRecords, nChannels, nSamples)
            if nChannels == 1:
                axisSqueeze = 1

        for i in range(nDatas):
            data = dataset[strds + str(i)][()].reshape(newShapes)
            if mode == 0:
                arrayData[:, i, :, :] = data
            else:
                arrayData[:, :, i, :] = data

        if axisSqueeze is not None:
            arrayData = numpy.squeeze(arrayData, axis=axisSqueeze)

        return arrayData

    def __initDataOut(self, type):
        """Placeholder called with the 'type' metadata entry; it does nothing."""
        return

    def __setDataOut(self):
        """
        Copy the metadata and the current block into ``self.dataOut``.

        Every metadata entry is set as an attribute of the same name.  The
        time of the block is stored as 'utctimeInit'; every other array
        contributes the record selected by self.blockList[self.blockIndex].

        Return:
            self.dataOut.data_param
        """
        record = self.blockList[self.blockIndex]

        for name, value in zip(self.listMetaname, self.listMeta):
            setattr(self.dataOut, name, value)

        for name, array in zip(self.listDataname, self.listData):
            if name == 'utctime':
                setattr(self.dataOut, 'utctimeInit', array[record])
                continue
            setattr(self.dataOut, name, array[record, :])

        return self.dataOut.data_param

    def getData(self):
        """
        Deliver the next selected block through ``self.dataOut``.

        Return:
            0 (with dataOut.flagNoData set) when there are no more files,
            None otherwise.
        """
        if self.blockIndex == self.blocksPerFile:
            if not self.__setNextFileOffline():
                self.dataOut.flagNoData = True
                return 0

        # The arrays of a file are loaded once and reused for all its blocks;
        # __setNextFileOffline resets listData when a new file is opened.
        if self.listData is None:
            self.__readData()

        self.__setDataOut()
        self.dataOut.flagNoData = False
        self.blockIndex += 1
        return

    def run(self, **kwargs):
        """Configure the reader on the first call (kwargs go to ``setup``), then read one block."""
        if not self.isConfig:
            self.setup(**kwargs)
            self.isConfig = True

        self.getData()
        return
class HDF5Writer(Operation):
    """
    Operation that stores selected attributes of a data object in HDF5 files.

    Files are written to <path>/dYYYYDDD/: one metadata file (prefix
    ``metaoptchar``) holding a 'Metadata' group, and data files (prefix
    ``optchar``) holding a 'Data' group.  A new data file is started every
    ``blocksPerFile`` blocks.

    Keyword arguments taken by ``run`` on its first call (see ``setup``):
    path, metadataList, dataList and, optionally, ext, blocksPerFile, mode.
    """

    ext = ".hdf5"
    optchar = "D"
    metaoptchar = "M"
    metaFile = None
    filename = None
    path = None
    setFile = None
    fp = None
    grp = None
    ds = None
    firsttime = True

    # Configurations
    blocksPerFile = None
    blockIndex = None
    dataOut = None

    # Data arrays
    dataList = None
    metadataList = None
    arrayDim = None
    tableDim = None
    # One row of the 'array dimensions' table per entry of dataList.
    dtype = [('arrayName', 'S20'), ('nDimensions', 'i'), ('dim2', 'i'), ('dim1', 'i'), ('dim0', 'i'), ('mode', 'b')]
    mode = None
    data = None         # Values of the block being written, one per dataset
    nDatas = None       # Number of datasets to be stored per array
    nDims = None        # Number of dimensions of each array
    nDimsForDs = None   # Number of dimensions of the array behind each dataset

    def __init__(self):
        """Initialise the base operation; configuration happens in ``setup``."""
        Operation.__init__(self)
        self.isConfig = False
        return

    def setup(self, dataOut, **kwargs):
        """
        Store the configuration and build the 'array dimensions' table.

        Inputs:
            dataOut : object whose attributes are written
            kwargs  : path (required), metadataList (required), dataList
                      (required), ext, blocksPerFile (default 10), mode
                      (one int for all arrays or one value per array;
                      default 0)

        Writes : self.arrayDim, one row per dataList entry laid out as
                 (nDimensions, dim2, dim1, dim0, mode), and self.tableDim,
                 the same information as a structured array of ``dtype``.
        """
        self.path = kwargs['path']

        if 'ext' in kwargs:
            self.ext = kwargs['ext']

        self.blocksPerFile = kwargs.get('blocksPerFile', 10)
        self.metadataList = kwargs['metadataList']
        self.dataList = kwargs['dataList']
        self.dataOut = dataOut

        nArrays = len(self.dataList)

        if 'mode' in kwargs:
            mode = kwargs['mode']
            if isinstance(mode, int):
                # A single int applies to every array.
                mode = numpy.zeros(nArrays) + mode
        else:
            mode = numpy.zeros(nArrays)

        self.mode = mode

        arrayDim = numpy.zeros((nArrays, 5))

        # Table dimensions
        dtype0 = self.dtype
        tableList = []

        for i, name in enumerate(self.dataList):
            dataAux = getattr(self.dataOut, name)

            # NumPy scalars expose .shape and take the array path below.
            isPlainNumber = isinstance(dataAux, (int, float)) and not isinstance(dataAux, numpy.generic)

            if isPlainNumber:
                arrayDim[i, 0] = 1
            else:
                arrayDim0 = dataAux.shape
                nDim = len(arrayDim0)
                arrayDim[i, 0] = nDim
                arrayDim[i, 4] = mode[i]

                if nDim == 3:
                    arrayDim[i, 1:-1] = numpy.array(arrayDim0)
                elif nDim == 2:
                    arrayDim[i, 2:-1] = numpy.array(arrayDim0)
                elif nDim == 1:
                    arrayDim[i, 3] = arrayDim0[0]
                elif nDim == 0:
                    # Zero-dimensional values are stored like plain numbers.
                    arrayDim[i, 0] = 1
                    arrayDim[i, 3] = 1

            table = numpy.array((name,) + tuple(arrayDim[i, :]), dtype=dtype0)
            tableList.append(table)

        self.arrayDim = arrayDim
        self.tableDim = numpy.array(tableList, dtype=dtype0)
        self.blockIndex = 0
        return

    def __lastSetFile(self, fullpath, prefix):
        """
        Return the set number of the last file of ``fullpath`` starting with ``prefix``.

        File names are expected to look like xYYYYDDDSSS.ext, the set number
        being characters 8 to 10.  -1 is returned when there is no such file
        or its set field is not a number.
        """
        names = sorted([k for k in os.listdir(fullpath) if k.startswith(prefix)], key=str.lower)
        if not names:
            return -1

        setTag = names[-1][8:11]
        if isNumber(setTag):
            return int(setTag)
        return -1

    def putMetadata(self):
        """Create the metadata file, write it and close it."""
        fp = self.createMetadataFile()
        try:
            self.writeMetadata(fp)
        finally:
            fp.close()
        return

    def createMetadataFile(self):
        """
        Create the next metadata file of the day folder of ``dataOut.utctime``.

        The day folder (and ``self.path``) is created when missing.  The set
        number follows the last metadata file already present in the folder.

        Writes : self.metaFile (base name of the new file)
        Return : the open h5py.File; the caller must close it.
        """
        timeTuple = time.localtime(self.dataOut.utctime)
        subfolder = 'd%4.4d%3.3d' % (timeTuple.tm_year, timeTuple.tm_yday)
        fullpath = os.path.join(self.path, subfolder)

        if not os.path.exists(fullpath):
            os.makedirs(fullpath)
            setFile = -1    # New folder: start the set counter
        else:
            setFile = self.__lastSetFile(fullpath, self.metaoptchar)

        setFile += 1

        metaName = '%s%4.4d%3.3d%3.3d%s' % (self.metaoptchar,
                                             timeTuple.tm_year,
                                             timeTuple.tm_yday,
                                             setFile,
                                             self.ext)

        filename = os.path.join(self.path, subfolder, metaName)
        self.metaFile = metaName

        # Setting HDF5 file
        return h5py.File(filename, 'w')

    def writeMetadata(self, fp):
        """Write the 'array dimensions' table and every ``metadataList`` attribute to 'Metadata'."""
        grp = fp.create_group("Metadata")
        grp.create_dataset('array dimensions', data=self.tableDim, dtype=self.dtype)

        for name in self.metadataList:
            grp.create_dataset(name, data=getattr(self.dataOut, name))
        return

    def setNextFile(self):
        """
        Close the current data file, create the next one and its empty datasets.

        Arrays flagged with one dimension get a single (1, 1) dataset that can
        grow up to (1, blocksPerFile).  Every other array gets a group with one
        dataset per channel ('channel<j>', mode 0) or per parameter
        ('param<j>'); two-dimensional arrays always use their first dimension
        as the number of datasets.

        Writes : self.filename, self.fp, self.grp, self.ds, self.data,
                 self.nDatas, self.nDims, self.nDimsForDs, self.setFile,
                 and resets self.firsttime / self.blockIndex.
        """
        mode = self.mode

        timeTuple = time.localtime(self.dataOut.utctime)
        subfolder = 'd%4.4d%3.3d' % (timeTuple.tm_year, timeTuple.tm_yday)
        fullpath = os.path.join(self.path, subfolder)

        if os.path.exists(fullpath):
            setFile = self.__lastSetFile(fullpath, self.optchar)
        else:
            # Data moved on to a day whose folder does not exist yet.
            # NOTE(review): the metadata file named in the 'metadata'
            # attribute stays in the folder where putMetadata created it.
            os.makedirs(fullpath)
            setFile = -1

        setFile += 1

        dataName = '%s%4.4d%3.3d%3.3d%s' % (self.optchar,
                                             timeTuple.tm_year,
                                             timeTuple.tm_yday,
                                             setFile,
                                             self.ext)

        filename = os.path.join(self.path, subfolder, dataName)

        # Do not leave the previous data file open.
        if self.fp is not None:
            self.fp.close()

        # Setting HDF5 file
        fp = h5py.File(filename, 'w')
        grp = fp.create_group("Data")
        grp.attrs['metadata'] = self.metaFile

        ds = []
        data = []
        nDimsForDs = []

        nDatas = numpy.zeros(len(self.dataList))
        nDims = self.arrayDim[:, 0]

        for i, name in enumerate(self.dataList):
            if nDims[i] == 1:
                ds0 = grp.create_dataset(name, (1, 1), maxshape=(1, self.blocksPerFile),
                                         chunks=True, dtype=numpy.float64)
                ds.append(ds0)
                data.append([])
                nDimsForDs.append(nDims[i])
                continue

            if mode[i] == 0:
                strMode = "channel"
                nDatas[i] = self.arrayDim[i, 1]
            else:
                strMode = "param"
                nDatas[i] = self.arrayDim[i, 2]

            if nDims[i] == 2:
                nDatas[i] = self.arrayDim[i, 2]

            # arrayDim is a float array; dataset shapes must be integers.
            nDim1 = int(self.arrayDim[i, 2])
            nDim0 = int(self.arrayDim[i, 3])

            grp0 = grp.create_group(name)

            for j in range(int(nDatas[i])):
                tableName = strMode + str(j)

                if nDims[i] == 3:
                    ds0 = grp0.create_dataset(tableName, (nDim1, nDim0, 1),
                                              data=numpy.zeros((nDim1, nDim0, 1)),
                                              maxshape=(None, nDim0, None), chunks=True)
                else:
                    ds0 = grp0.create_dataset(tableName, (1, nDim0),
                                              data=numpy.zeros((1, nDim0)),
                                              maxshape=(None, nDim0), chunks=True)

                ds.append(ds0)
                data.append([])
                nDimsForDs.append(nDims[i])

        self.nDatas = nDatas
        self.nDims = nDims
        self.nDimsForDs = nDimsForDs

        # Saving variables
        print('Writing the file: %s' % filename)
        self.filename = filename
        self.fp = fp
        self.grp = grp
        self.grp.attrs.modify('nRecords', 1)
        self.ds = ds
        self.data = data
        self.setFile = setFile
        self.firsttime = True
        self.blockIndex = 0
        return

    def putData(self):
        """
        Write the current block, starting a new file when the current one is full.

        After the first block of a file, the file is flushed, closed and
        reopened before every write.
        """
        if not self.firsttime:
            self.fp.flush()
            self.fp.close()
            self.readBlock()

        if self.blockIndex == self.blocksPerFile:
            self.setNextFile()

        self.setBlock()
        self.writeBlock()
        return

    def readBlock(self):
        """
        Reopen the current data file in 'r+' mode and rebind every dataset handle.

        Writes : self.fp, self.grp, self.ds
        """
        ds = self.ds

        # Setting HDF5 file
        fp = h5py.File(self.filename, 'r+')
        grp = fp["Data"]
        ind = 0

        for i, name in enumerate(self.dataList):
            if self.nDims[i] == 1:
                ds[ind] = grp[name]
                ind += 1
                continue

            if self.mode[i] == 0:
                strMode = "channel"
            else:
                strMode = "param"

            grp0 = grp[name]

            for j in range(int(self.nDatas[i])):
                ds[ind] = grp0[strMode + str(j)]
                ind += 1

        self.fp = fp
        self.grp = grp
        self.ds = ds
        return

    def setBlock(self):
        """
        Split the current ``dataOut`` attributes into one value per dataset.

        Arrays flagged with one dimension are taken whole.  The others are
        sliced along their first axis (mode 0 or two-dimensional arrays) or
        along their second axis.

        Writes : self.data
        """
        data = self.data
        nDatas = self.nDatas
        nDims = self.nDims
        mode = self.mode
        ind = 0

        for i, name in enumerate(self.dataList):
            dataAux = getattr(self.dataOut, name)

            if nDims[i] == 1:
                data[ind] = dataAux
                ind += 1
                continue

            # Division per channel, or the array has a single sliceable axis.
            byFirstAxis = (mode[i] == 0) or (nDims[i] == 2)

            for j in range(int(nDatas[i])):
                if byFirstAxis:
                    data[ind] = dataAux[j, :]
                else:
                    data[ind] = dataAux[:, j, :]
                ind += 1

        self.data = data
        return

    def writeBlock(self):
        """
        Save the block held in ``self.data`` in the HDF5 file.

        The first block of a file overwrites the placeholder created by
        ``setNextFile``; later blocks grow every dataset by one record.  The
        'nRecords' attribute of the 'Data' group is kept equal to the number
        of blocks written to the file.
        """
        for i in range(len(self.ds)):
            ds = self.ds[i]
            block = self.data[i]
            nDim = self.nDimsForDs[i]

            if self.firsttime:
                if isinstance(block, numpy.ndarray):
                    if len(ds.shape) == 3:
                        block = block.reshape((block.shape[0], block.shape[1], 1))
                    elif nDim == 2 and block.ndim == 1:
                        # The dataset has two axes; a resize needs the same rank.
                        block = block.reshape((1, block.shape[0]))
                    self.data[i] = block
                    ds.resize(block.shape)
                # Plain numbers are broadcast into the (1, 1) placeholder.
                ds[:] = block
            elif nDim == 1:
                ds.resize((ds.shape[0], ds.shape[1] + 1))
                ds[0, -1] = block
            elif nDim == 2:
                ds.resize((ds.shape[0] + 1, ds.shape[1]))
                ds[self.blockIndex, :] = block
            elif nDim == 3:
                dataShape = block.shape
                dsShape = ds.shape
                if dataShape[0] == dsShape[0]:
                    # Same first dimension: append along the last axis.
                    ds.resize((dsShape[0], dsShape[1], dsShape[2] + 1))
                    ds[:, :, -1] = block
                else:
                    # Different first dimension: append along the first axis.
                    ds.resize((dsShape[0] + dataShape[0], dsShape[1], dsShape[2]))
                    ds[dsShape[0]:, :, 0] = block

        self.blockIndex += 1
        self.grp.attrs.modify('nRecords', self.blockIndex)
        self.firsttime = False
        return

    def run(self, dataOut, **kwargs):
        """
        Write one block of ``dataOut``.

        The first call also configures the writer with ``kwargs`` (see
        ``setup``), writes the metadata file and creates the first data file.
        """
        if not self.isConfig:
            self.setup(dataOut, **kwargs)
            self.isConfig = True
            self.putMetadata()
            self.setNextFile()

        # setup() only stores the object of the first call; always write from
        # the object handed to this call.
        self.dataOut = dataOut
        self.putData()
        return