Inference Data Cookbook

InferenceData is the central data format for ArviZ. InferenceData itself is just a container that maintains references to one or more xarray.Dataset objects. See the InferenceData structure specification in the ArviZ documentation. Below are various ways to generate an InferenceData object. See the xarray documentation for more on xarray.

[12]:
import arviz as az
import numpy as np

From 1d numpy array

[13]:
# A bare 1-d array is interpreted as a single chain of draws: arviz wraps it
# in the (chain, draw) structure automatically.
n_draws = 100
dataset = az.convert_to_inference_data(np.random.randn(n_draws))
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[13]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
Data variables:
    x        (chain, draw) float64 -0.5987 -1.116 0.7667 ... -1.018 -0.07785
Attributes:
    created_at:  2019-09-15T17:56:22.170325

From nd numpy array

[14]:
# For an n-d array the first two axes become (chain, draw); the remaining
# axes are auto-named x_dim_0, x_dim_1, ...
array_shape = (1, 2, 3, 4, 5)
dataset = az.convert_to_inference_data(np.random.randn(*array_shape))
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[14]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 2, x_dim_0: 3, x_dim_1: 4, x_dim_2: 5)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1
  * x_dim_0  (x_dim_0) int64 0 1 2
  * x_dim_1  (x_dim_1) int64 0 1 2 3
  * x_dim_2  (x_dim_2) int64 0 1 2 3 4
Data variables:
    x        (chain, draw, x_dim_0, x_dim_1, x_dim_2) float64 0.3827 ... -1.507
Attributes:
    created_at:  2019-09-15T17:56:22.182246

From a dictionary

[15]:
# Several variables at once: each dict entry becomes a data variable whose
# trailing axes are auto-named '<var>_dim_<i>'.
shapes = {
    'a': (100,),
    'b': (1, 100, 10),
    'c': (1, 100, 3, 4),
}
datadict = {name: np.random.randn(*shape) for name, shape in shapes.items()}
dataset = az.convert_to_inference_data(datadict)
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[15]:
<xarray.Dataset>
Dimensions:  (b_dim_0: 10, c_dim_0: 3, c_dim_1: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b_dim_0  (b_dim_0) int64 0 1 2 3 4 5 6 7 8 9
  * c_dim_0  (c_dim_0) int64 0 1 2
  * c_dim_1  (c_dim_1) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 0.6765 -0.382 -0.2243 ... 1.022 -0.5525
    b        (chain, draw, b_dim_0) float64 -0.3869 -0.5103 ... 0.2466 0.608
    c        (chain, draw, c_dim_0, c_dim_1) float64 -0.8396 -1.368 ... 1.384
Attributes:
    created_at:  2019-09-15T17:56:22.202381

From dictionary with coords and dims

[16]:
# Supplying coords/dims replaces the auto-generated '<var>_dim_<i>' names
# with meaningful dimension labels and coordinate values.
shapes = {
    'a': (100,),
    'b': (1, 100, 10),
    'c': (1, 100, 3, 4),
}
datadict = {name: np.random.randn(*shape) for name, shape in shapes.items()}

# Coordinate values for each named dimension.
coords = {
    'c1': np.arange(3),
    'c2': np.arange(4),
    'b1': np.arange(10),
}
# Which named dimensions each variable's trailing axes map onto.
dims = {
    'b': ['b1'],
    'c': ['c1', 'c2'],
}

dataset = az.convert_to_inference_data(datadict, coords=coords, dims=dims)
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[16]:
<xarray.Dataset>
Dimensions:  (b1: 10, c1: 3, c2: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b1       (b1) int64 0 1 2 3 4 5 6 7 8 9
  * c1       (c1) int64 0 1 2
  * c2       (c2) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 1.252 -1.145 -0.09127 ... 0.8395 -0.1739
    b        (chain, draw, b1) float64 -2.811 -0.1507 -0.481 ... -0.2532 1.098
    c        (chain, draw, c1, c2) float64 -1.84 -0.2114 ... -0.005779 -0.07
Attributes:
    created_at:  2019-09-15T17:56:22.225542

From pymc3

[17]:
import pymc3 as pm
# Sampling configuration for the eight-schools example.
draws = 500
chains = 2

# Eight-schools data: J schools, observed effects y, and their standard errors.
eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

with pm.Model() as model:
    # Non-centered parameterization: theta = mu + tau * theta_tilde.
    mu = pm.Normal('mu', mu=0, sd=5)
    tau = pm.HalfCauchy('tau', beta=5)
    theta_tilde = pm.Normal('theta_tilde', mu=0, sd=1, shape=eight_school_data['J'])
    theta = pm.Deterministic('theta', mu + tau * theta_tilde)
    pm.Normal('obs', mu=theta, sd=eight_school_data['sigma'], observed=eight_school_data['y'])

    # Posterior, prior-predictive, and posterior-predictive samples.
    trace = pm.sample(draws, chains=chains)
    prior = pm.sample_prior_predictive()
    posterior_predictive = pm.sample_posterior_predictive(trace)

    # Bundle everything into one InferenceData; dims attaches the 'school'
    # coordinate to the per-school variables.
    pm_data = az.from_pymc3(
            trace=trace,
            prior=prior,
            posterior_predictive=posterior_predictive,
            coords={'school': np.arange(eight_school_data['J'])},
            dims={'theta': ['school'], 'theta_tilde': ['school']},
        )
pm_data
Auto-assigning NUTS sampler...
INFO:pymc3:Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
INFO:pymc3:Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
INFO:pymc3:Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta_tilde, tau, mu]
INFO:pymc3:NUTS: [theta_tilde, tau, mu]
Sampling 2 chains, 0 divergences: 100%|██████████| 2000/2000 [00:01<00:00, 1977.81draws/s]
100%|██████████| 1000/1000 [00:00<00:00, 2114.22it/s]
[17]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > prior
        > observed_data

From pystan

[18]:
import pystan

# Eight-schools model in Stan (non-centered parameterization). The generated
# quantities block emits pointwise log-likelihood (log_lik) and
# posterior-predictive draws (y_hat) so both can be extracted below.
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

# Compile and sample; adapt_delta is raised to 0.9 to reduce divergences.
stan_model = pystan.StanModel(model_code=schools_code)
fit = stan_model.sampling(data=eight_school_data, control={"adapt_delta" : 0.9})

# Map fit quantities to InferenceData groups; dims attaches the 'school'
# coordinate to every per-school quantity.
stan_data = az.from_pystan(
    posterior=fit,
    posterior_predictive='y_hat',
    observed_data=['y'],
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
    }
)

stan_data
INFO:pystan:COMPILING THE C++ CODE FOR MODEL anon_model_9d743830ec4a29fb58eb4660d4b9417f NOW.
[18]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > observed_data

From pyro

[19]:
import torch

import pyro
import pyro.distributions as dist
from pyro.infer import MCMC, NUTS, Predictive

pyro.enable_validation(True)
pyro.set_rng_seed(0)

# Sampling configuration for the eight-schools example.
draws = 500
chains = 2
eight_school_data = {
    'J': 8,
    'y': torch.tensor([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': torch.tensor([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    """Eight-schools model (non-centered); y=None lets Predictive draw y."""
    mu = pyro.sample('mu', dist.Normal(0, 5))
    tau = pyro.sample('tau', dist.HalfCauchy(5))
    with pyro.plate('J', J):
        theta_tilde = pyro.sample('theta_tilde', dist.Normal(0, 1))
        theta = mu + tau * theta_tilde
        return pyro.sample("obs", dist.Normal(theta, sigma), obs=y)

nuts_kernel = NUTS(model, jit_compile=True, ignore_jit_warnings=True)
mcmc = MCMC(nuts_kernel, num_samples=draws, warmup_steps=draws,
            num_chains=chains, disable_progbar=True)
mcmc.run(**eight_school_data)
posterior_samples = mcmc.get_samples()
# Posterior-predictive and prior draws, conditioned on the same J and sigma.
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    eight_school_data['J'], eight_school_data['sigma'])
prior = Predictive(model, num_samples=500).get_samples(
    eight_school_data['J'], eight_school_data['sigma'])

# Bundle into InferenceData; 'school' labels theta's per-school dimension.
pyro_data = az.from_pyro(mcmc, prior=prior, posterior_predictive=posterior_predictive,
                         coords={'school': np.arange(eight_school_data['J'])},
                         dims={'theta': ['school']})
pyro_data
[19]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > prior

From emcee

[20]:
import emcee

# Eight-schools observations: J schools, effects y, and standard errors sigma.
eight_school_data = dict(
    J=8,
    y=np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    sigma=np.array([15., 10., 16., 11., 9., 11., 10., 18.]),
)

def log_prior_8school(theta, J):
    """Unnormalized log-prior for the packed parameter vector.

    ``theta`` holds (mu, tau, eta[0:J]); ``J`` is unused but kept so the
    prior and likelihood share a calling convention.
    """
    mu, tau = theta[0], theta[1]
    eta = theta[2:]
    if tau < 0:
        # Half-Cauchy support: tau must be non-negative.
        return -np.inf
    hwhm = 25
    log_p = -(mu / 10) ** 2               # normal prior, loc=0, scale=10
    log_p += -np.log(tau ** 2 + hwhm ** 2)  # half-Cauchy prior on tau
    log_p += -np.sum(eta ** 2)            # normal prior, loc=0, scale=1
    return log_p

def log_likelihood_8school(theta, y, sigma):
    """Unnormalized normal log-likelihood: minus the sum of squared
    standardized residuals of y around mu + tau * eta."""
    mu, tau = theta[0], theta[1]
    eta = theta[2:]
    z = (mu + tau * eta - y) / sigma
    return -np.sum(z ** 2)

def lnprob_8school(theta, J, y, sigma):
    """Log posterior density (prior + likelihood) for the emcee sampler."""
    lp = log_prior_8school(theta, J)
    # Short-circuit when the prior rules the point out entirely.
    if lp <= -np.inf:
        return -np.inf
    return log_likelihood_8school(theta, y, sigma) + lp

# Ensemble configuration: one dimension per eta plus mu and tau.
n_walkers = 40
n_dim = eight_school_data['J'] + 2
n_draws = 1500

# Random starting positions; tau (column 1) must start non-negative.
start = np.random.normal(size=(n_walkers, n_dim))
start[:, 1] = np.absolute(start[:, 1])

sampler = emcee.EnsembleSampler(
    n_walkers,
    n_dim,
    lnprob_8school,
    args=(
        eight_school_data['J'],
        eight_school_data['y'],
        eight_school_data['sigma'],
    ),
)
sampler.run_mcmc(start, n_draws)

# Define variable names explicitly; they cannot be inferred from emcee.
var_names = ['mu', 'tau'] + ['eta{}'.format(i) for i in range(eight_school_data['J'])]
emcee_data = az.from_emcee(sampler, var_names=var_names)
emcee_data
[20]:
Inference data with groups:
        > posterior
        > observed_data

From CmdStanPy

[21]:
from cmdstanpy import Model, StanFit
# NOTE(review): StanFit is imported but never used below — candidate for removal.

# Same eight-schools Stan program as in the PyStan section: non-centered
# parameterization with log_lik and y_hat produced in generated quantities.
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

# CmdStanPy compiles from a .stan file on disk, so write the program out first.
with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

stan_file = "./eight_school.stan"
stan_model = Model(stan_file=stan_file)
stan_model.compile()

eight_school_data = {'J': 8,
                     'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
                     'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
                    }

stan_fit = stan_model.sample(data=eight_school_data)

# Map fit quantities to InferenceData groups; observed data is passed
# explicitly as a dict because CmdStanPy does not retain it on the fit.
cmdstanpy_data = az.from_cmdstanpy(
    posterior=stan_fit,
    posterior_predictive='y_hat',
    observed_data={'y' : eight_school_data["y"]},
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
        }
)
cmdstanpy_data
INFO:cmdstanpy:compiling c++
INFO:cmdstanpy:compiled model file: /home/canyon/repos/arviz/doc/notebooks/eight_school
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:start chain 2
INFO:cmdstanpy:start chain 3
INFO:cmdstanpy:start chain 4
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:finish chain 2
INFO:cmdstanpy:finish chain 3
INFO:cmdstanpy:finish chain 4
[21]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > observed_data

From CmdStan

See from_cmdstan for details.

CmdStan helpers

[22]:
# save for CmdStan example (needs CmdStanPy run)
stan_fit.save_csvfiles(dir="sample_data", basename="eight_school_samples")
[23]:
# Same eight-schools Stan program as above, written to disk so the CmdStan
# command-line example can compile it.
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)
[24]:
# Eight-schools data again, to be serialized in R dump format for CmdStan.
eight_school_data = dict(
    J=8,
    y=np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    sigma=np.array([15., 10., 16., 11., 9., 11., 10., 18.]),
)
[25]:
import pystan
# Serialize the data dict in R dump format, the input format the CmdStan
# command-line binary reads.
pystan.stan_rdump(eight_school_data, "./eight_school.data.R")
[26]:
# Bash shell
#
# $ cd cmdstan
# $ make build
# $ make path/to/eight_school
# $ cd path/to
# $ for i in {1..4}
#   do
#     ./eight_school sample random seed=12345 \
#       id=$i data file=eight_school.data.R \
#       output file=sample_data/eight_school_samples-$i.csv &
#   done
# $
[27]:
# Let's use .stan and .csv files created/saved by the CmdStanPy procedure

# glob string
posterior_glob =  "sample_data/eight_school_samples-[0-9].csv"
# list of paths
# posterior_list =  [
#     "sample_data/eight_school_samples-1.csv",
#     "sample_data/eight_school_samples-2.csv",
#     "sample_data/eight_school_samples-3.csv",
#     "sample_data/eight_school_samples-4.csv",
# ]

obs_data_path = "./eight_school.data.R"

cmdstan_data = az.from_cmdstan(
    posterior = posterior_glob,
    posterior_predictive='y_hat',
    observed_data=obs_data_path,
    observed_data_var="y",
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school'],
          'y': ['school'],
          'log_lik': ['school'],
          'y_hat': ['school'],
          'theta_tilde': ['school']
         }
)
cmdstan_data
arviz.data.io_cmdstan - INFO - glob found 4 files for 'posterior':
1: sample_data/eight_school_samples-1.csv
2: sample_data/eight_school_samples-2.csv
3: sample_data/eight_school_samples-3.csv
4: sample_data/eight_school_samples-4.csv
INFO:arviz.data.io_cmdstan:glob found 4 files for 'posterior':
1: sample_data/eight_school_samples-1.csv
2: sample_data/eight_school_samples-2.csv
3: sample_data/eight_school_samples-3.csv
4: sample_data/eight_school_samples-4.csv
/home/canyon/miniconda3/envs/arviz/lib/python3.6/site-packages/numba/compiler.py:602: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../arviz/utils.py", line 339:
@conditional_jit(parallel=True)
def full(shape, x):
^

  self.func_ir.loc))
/home/canyon/miniconda3/envs/arviz/lib/python3.6/site-packages/numba/compiler.py:602: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.

To find out why, try turning on parallel diagnostics, see http://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "../../arviz/utils.py", line 339:
@conditional_jit(parallel=True)
def full(shape, x):
^

  self.func_ir.loc))
[27]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > observed_data

From NumPyro

[28]:
from jax.random import PRNGKey

import numpyro
import numpyro.distributions as dist
from numpyro.distributions.transforms import AffineTransform
from numpyro.infer import MCMC, NUTS, Predictive

# Allow the four chains to run on four host devices in parallel.
numpyro.set_host_device_count(4)

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def model(J, sigma, y=None):
    """Eight-schools model; y=None lets Predictive draw y."""
    mu = numpyro.sample('mu', dist.Normal(0, 5))
    tau = numpyro.sample('tau', dist.HalfCauchy(5))
    # use non-centered reparameterization
    theta = numpyro.sample('theta', dist.TransformedDistribution(
        dist.Normal(np.zeros(J), 1), AffineTransform(mu, tau)))
    numpyro.sample('y', dist.Normal(theta, sigma), obs=y)

kernel = NUTS(model)
mcmc = MCMC(kernel, num_warmup=500, num_samples=500, num_chains=4, chain_method='parallel')
# extra_fields records per-draw num_steps and energy in sample_stats.
mcmc.run(PRNGKey(0), **eight_school_data, extra_fields=['num_steps', 'energy'])
posterior_samples = mcmc.get_samples()
# Posterior-predictive and prior draws, each with its own PRNG key.
posterior_predictive = Predictive(model, posterior_samples).get_samples(
    PRNGKey(1), eight_school_data['J'], eight_school_data['sigma'])
prior = Predictive(model, num_samples=500).get_samples(
    PRNGKey(2), eight_school_data['J'], eight_school_data['sigma'])

# Bundle into InferenceData; 'school' labels theta's per-school dimension.
numpyro_data = az.from_numpyro(mcmc, prior=prior, posterior_predictive=posterior_predictive,
                               coords={'school': np.arange(eight_school_data['J'])},
                               dims={'theta': ['school']})
numpyro_data
[28]:
Inference data with groups:
        > posterior
        > sample_stats
        > posterior_predictive
        > prior
        > observed_data