Source code for dsgrid.dataformat.datatable

import logging

import numpy as np
import pandas as pd

from dsgrid.helpers import h5Reader
from dsgrid.dataformat.sectordataset import NULL_IDX

logger = logging.getLogger(__name__)

class Datatable(object):
    """Flattened, in-memory view of all sector data in a datafile.

    Every sector dataset found in the datafile is read, restricted to the
    geographies that have a stored row (mapping index other than
    ``NULL_IDX``), multiplied by the per-geography scale factor, and
    flattened into a single ``float32`` :class:`pandas.Series`. The Series
    is indexed by a four-level categorical ``MultiIndex`` named
    ``("sector", "geography", "enduse", "time")``.

    Attributes set by the constructor:
        sector_enum, geo_enum, enduse_enum, time_enum: copied from the
            datafile; their ``ids`` define the index categories.
        data: the combined ``pandas.Series`` (empty if nothing was loaded).
        sorted: whether ``data`` has been index-sorted via :meth:`sort`.
        warned: whether the "not sorted" warning has already been logged.
    """

    def __init__(self, datafile, sort=True, verify_integrity=True):
        """Load all sector datasets from ``datafile`` into one Series.

        Args:
            datafile: object exposing ``sector_enum``, ``geo_enum``,
                ``enduse_enum``, ``time_enum``, ``h5path`` and a
                ``sectordata`` mapping of sector name to sector dataset
                (each with ``enduses`` and ``times``).
            sort: if true, sort the combined index right after loading.
            verify_integrity: forwarded to ``pandas.concat`` so duplicate
                index entries across sectors raise an error.
        """
        self.sector_enum = datafile.sector_enum
        self.geo_enum = datafile.geo_enum
        self.enduse_enum = datafile.enduse_enum
        self.time_enum = datafile.time_enum

        sector_series = []
        with h5Reader(datafile.h5path) as f:
            for sectorname, sectordataset in datafile.sectordata.items():
                dgroup = f["data/" + sectorname]
                dset = dgroup["data"]

                # One mapping record per geography in the enum; records
                # whose "idx" equals NULL_IDX have no row in the dataset.
                geo_mappings = dgroup["geographies"][:]
                geo_idxs = np.flatnonzero(geo_mappings["idx"] != NULL_IDX)
                geo_ids = np.array(self.geo_enum.ids)[geo_idxs]
                geo_dset_idxs = geo_mappings["idx"][geo_idxs]
                geo_scales = geo_mappings["scale"][geo_idxs]

                index = self._categoricalmultiindex(
                    [sectorname], geo_ids,
                    sectordataset.enduses, sectordataset.times)

                # Read the full 3-D array first, then select rows in memory.
                # NOTE(review): presumably done because the stored row
                # indices may repeat or be unordered -- confirm.
                data = dset[:, :, :]
                # Scale each selected geography's slab by its own factor.
                data = data[geo_dset_idxs, :, :] * geo_scales.reshape(-1, 1, 1)
                sector_series.append(
                    pd.Series(data.reshape(-1), index=index, dtype="float32"))

        if sector_series:
            self.data = pd.concat(
                sector_series, verify_integrity=verify_integrity, copy=False)
        else:
            # The rest of this constructor identifies the file through
            # ``h5path``; fall back to it so that a datafile without a
            # ``filepath`` attribute still yields a warning rather than an
            # AttributeError.
            location = getattr(datafile, "filepath", datafile.h5path)
            logger.warning("No data in %s", location)
            self.data = pd.Series(dtype='float32')

        self.sorted = False
        self.warned = False
        if sort:
            self.sort()

    def sort(self):
        """Sort ``self.data`` by its index in place (no-op if already done)."""
        if not self.sorted:
            self.data.sort_index(inplace=True)
            self.sorted = True

    def __getitem__(self, idxs):
        """Return ``self.data.loc[idxs]`` for a four-dimensional index.

        Args:
            idxs: a length-4 indexer, one entry per index level
                (sector, geography, enduse, time).

        Raises:
            KeyError: if ``idxs`` does not provide exactly four entries,
                including objects that have no length at all.
        """
        if (not self.warned) and (not self.sorted):
            logger.warning("Datatable is not sorted. Some kinds of indexing may be unreliable.")
            self.warned = True

        # Objects without a length (e.g. a bare int or slice) cannot address
        # four dimensions; report them like any other wrong-sized index
        # instead of leaking a TypeError from len().
        try:
            n_dims = len(idxs)
        except TypeError:
            n_dims = None

        if n_dims != 4:
            raise KeyError("Indexing into a Datatable requires indices " +
                           "for all four dimensions")

        return self.data.loc[idxs]

    def _categoricalmultiindex(self, sectors, geos, enduses, times):
        """Build the product MultiIndex for one sector's data.

        Each argument is wrapped in a ``CategoricalIndex`` whose categories
        are the full id list of the matching enum, so the Series produced
        for different sectors share the same categorical levels.

        Returns:
            pandas.MultiIndex: the cartesian product of the four inputs,
            with levels named ``sector``, ``geography``, ``enduse``, ``time``.
        """
        sectors = pd.CategoricalIndex(
            data=sectors, categories=self.sector_enum.ids)
        geos = pd.CategoricalIndex(
            data=geos, categories=self.geo_enum.ids)
        enduses = pd.CategoricalIndex(
            data=enduses, categories=self.enduse_enum.ids)
        times = pd.CategoricalIndex(
            data=times, categories=self.time_enum.ids)

        return pd.MultiIndex.from_product(
            [sectors, geos, enduses, times],
            names=["sector", "geography", "enduse", "time"])