"""
Data
----
Define some datasets as functions. In each case, an instance of :class:`Data`
is returned to have a consistent representation.
* :class:`Data`,
* :func:`el_nino`,
* :func:`tahiti`,
* :func:`mascaret`,
* :func:`marthe`.
"""
import collections
import collections.abc
import logging
import os

import numpy as np
class Data(collections.abc.Mapping):
    """Wrap datasets into a Mapping container.

    Store a dataset along with some information about it.
    :attr:`data` corresponds to the model's output and :attr:`sample` to the
    corresponding inputs.

    When labels are provided, structured arrays are created for
    :attr:`data` (fields named after ``flabels``) and :attr:`sample` (fields
    named after ``plabels``). This allows accessing values using either
    normal indexing or the labels as field names. Without labels, regular
    arrays are stored.

    If required, :meth:`toarray` converts both :attr:`data` and
    :attr:`sample` back into regular arrays.
    """

    logger = logging.getLogger(__name__)

    def __init__(self, data, desc, sample=None, plabels=None, flabels=None):
        """Dataset container.

        Both ``data`` and ``sample`` are required to be 2D arrays with one
        row per sample. Thus with one feature, shape must be (n_samples, 1).

        :param array_like data: model outputs (n_samples, n_features).
        :param str desc: dataset description.
        :param array_like sample: sampling used to create the data
          (n_samples, n_parameters).
        :param list(str) plabels: parameters' labels (n_parameters,).
        :param list(str) flabels: name of the quantities of interest
          (n_features,).
        :raises SystemError: if ``sample`` and ``data`` do not hold the same
          number of rows.
        """
        self.desc = desc
        self.plabels = plabels
        self.flabels = flabels

        # ``.shape`` is read below, so coerce array_like inputs first.
        data = np.asarray(data)
        if sample is not None:
            sample = np.asarray(sample)

        if sample is None:
            self.shape = data.shape
            self.in_shape = None
        else:
            self.shape = (sample.shape, data.shape)
            self.in_shape = sample.shape[1]

        self.data = self._to_records(data, self.flabels)
        if sample is None:
            self.sample = None
        else:
            self.sample = self._to_records(sample, self.plabels)

        if self.sample is not None and len(self.sample) != len(self.data):
            msg = ("Sample shape not consistent with data shape: "
                   "{} != {}".format(len(self.sample), len(self.data)))
            self.logger.error(msg)
            raise SystemError(msg)

    @staticmethod
    def _to_records(values, labels):
        """Build one record per row, named after ``labels`` when given."""
        dtype = None
        if labels is not None:
            dtype = {'names': labels, 'formats': ['f8'] * len(labels)}
        return np.asarray([tuple(row) for row in values], dtype=dtype)

    @staticmethod
    def _to_regular(array):
        """Return ``array`` as a regular 2D array, viewing it if structured."""
        names = array.dtype.names
        if names is None:
            # Already a regular array: no labels given or already converted.
            return array
        # The sub-array shape is given as a tuple so that a single field
        # still yields a (n_samples, 1) array.
        return array.view((array.dtype[0], (len(names),)))

    def toarray(self):
        """Convert the structured arrays to regular arrays.

        This removes the ability to access :attr:`sample` and :attr:`data`
        using the respective labels. Arrays that are already regular, and a
        missing :attr:`sample`, are left untouched.
        """
        self.data = self._to_regular(self.data)
        if self.sample is not None:
            self.sample = self._to_regular(self.sample)

    def __getitem__(self, key):
        """Return the corresponding data or a tuple of (sample, data)."""
        if self.sample is None:
            return self.data[key]
        return self.sample[key], self.data[key]

    def __iter__(self):
        """Iterate over data or a zip of sample and data."""
        if self.sample is None:
            return iter(self.data)
        return iter(zip(self.sample, self.data))

    def __len__(self):
        """Based on the number of sample."""
        return len(self.data)

    def __str__(self):
        """Describe and summarize."""
        return self.desc + '\n\n' + self.__repr__()

    def __repr__(self):
        """Summarize the container."""
        # ``shape`` is (sample.shape, data.shape) with a sample and plain
        # data.shape without one.
        data_shape = self.shape if self.sample is None else self.shape[1]
        msg = ("Dataset summary:\n"
               "-> Input dimension: {}\n"
               "-> Output dimension: {}\n"
               "-> Number of samples: {}\n"
               "-> Input labels:\n{}\n"
               "-> Output labels:\n{}\n"
               ).format(self.in_shape, data_shape[1], data_shape[0],
                        self.plabels, self.flabels)
        return msg
# Directory holding this module; dataset files are resolved against it
PATH = os.path.split(os.path.realpath(__file__))[0]
def el_nino():
    """El Nino dataset.

    Columns 0 and 2 of ``elnino.dat`` are read (first row skipped) and
    folded into rows of 12 monthly values. The first label of each row is
    used as the single ``Year`` input.

    :return: the dataset wrapped in a :class:`Data` instance.
    :rtype: Data
    """
    months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    desc = ("Averaged monthly sea surface temperature (SST) in degrees Celcius"
            " of the Pacific Ocean at 0-10 deg South and 90-80 deg West"
            " between 1950 and 2007.\nSource: NOAA - ERSSTv5 - Nino 1+2 at"
            " http://www.cpc.ncep.noaa.gov/data/indices/")

    filename = os.path.join(PATH, 'elnino.dat')
    raw_labels, raw_values = np.loadtxt(filename, skiprows=1,
                                        usecols=(0, 2), unpack=True)

    # One row per group of 12 entries; keep the first label of each group
    # as a column vector.
    years = raw_labels.reshape(-1, 12)[:, :1]
    monthly = raw_values.reshape(-1, 12)

    return Data(data=monthly, desc=desc, sample=years,
                plabels=['Year'], flabels=months)
def tahiti():
    """Tahiti dataset.

    The first 13 columns of ``tahiti.dat`` are read (first 4 rows skipped):
    column 0 is used as the ``Year`` input and the 12 following columns as
    the monthly outputs.

    :return: the dataset wrapped in a :class:`Data` instance.
    :rtype: Data
    """
    # A space is needed at the junction of the first two fragments,
    # otherwise the description reads "millibarsat Tahiti".
    desc = ("Averaged monthly sea level pressure (SLP) in millibars"
            " at Tahiti between 1951 and 2016.\nSource: NOAA - Tahiti SLP at"
            " http://www.cpc.ncep.noaa.gov/data/indices/")
    dataset = np.loadtxt(os.path.join(PATH, 'tahiti.dat'),
                         skiprows=4, usecols=range(0, 13))
    labels = dataset[:, 0].reshape(-1, 1)
    data = dataset[:, 1:]
    flabels = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
               'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
    return Data(data=data, desc=desc, sample=labels,
                plabels=['Year'], flabels=flabels)
def mascaret():
    """Mascaret dataset.

    Inputs are loaded from ``input_mascaret.npy`` (labelled ``Ks`` and
    ``Q``) and outputs from ``output_mascaret.npy`` (14 labelled columns).

    :return: the dataset wrapped in a :class:`Data` instance.
    :rtype: Data
    """
    input_file = os.path.join(PATH, 'input_mascaret.npy')
    output_file = os.path.join(PATH, 'output_mascaret.npy')

    desc = ("Monte-Carlo sampling simulated using MASCARET flow solver."
            " The Garonne river was used and the output consists in 14 water"
            " height observations. Two random variables are used:"
            " the friction coefficient Ks~U(15, 60) and the mass flow"
            " rate Q~N(4035, 400).")
    output_names = ['13150', '19450', '21825', '21925', '25775', '32000',
                    '36131.67', '36240', '36290', '38230.45', '44557.5',
                    '51053.33', '57550', '62175']

    inputs = np.load(input_file)
    outputs = np.load(output_file)

    return Data(data=outputs, desc=desc, sample=inputs,
                plabels=['Ks', 'Q'], flabels=output_names)
def marthe():
    """MARTHE dataset.

    ``marthe.dat`` is read (first row skipped); its first 20 columns are
    used as inputs and the remaining columns as outputs.

    :return: the dataset wrapped in a :class:`Data` instance.
    :rtype: Data
    """
    input_names = ['per1', 'per2', 'per3', 'perz1', 'perz2', 'perz3', 'perz4',
                   'd1', 'd2', 'd3', 'dt1', 'dt2', 'dt3', 'kd1', 'kd2', 'kd3',
                   'poros', 'i1', 'i2', 'i3']
    output_names = ['p102K', 'p104', 'p106', 'p2.76', 'p29K',
                    'p31K', 'p35K', 'p37K', 'p38', 'p4b']
    desc = ("In 2005, CEA (France) and Kurchatov Institute (Russia) developed"
            " a model of strontium 90 migration in a porous water-saturated"
            " medium. The scenario concerned the temporary storage of"
            " radioactive waste (STDR) in a site close to Moscow. The main"
            " purpose was to predict the transport of 90Sr between 2002 and"
            " 2010, in order to determine the aquifer contamination. The"
            " numerical simulation of the 90Sr transport in the upper aquifer"
            " of the site was realized via the MARTHE code"
            " (developed by BRGM, France).")

    table = np.loadtxt(os.path.join(PATH, 'marthe.dat'), skiprows=1)

    # The 20 input columns come first, the outputs follow.
    n_inputs = len(input_names)
    inputs = table[:, :n_inputs]
    outputs = table[:, n_inputs:]

    return Data(data=outputs, desc=desc, sample=inputs,
                plabels=input_names, flabels=output_names)