Inference Data Cookbook

InferenceData is the central data format for ArviZ. An InferenceData object is simply a container that holds references to one or more xarray.Dataset objects, one per group (posterior, prior, sample_stats, and so on). Below are various ways to generate an InferenceData object. See the xarray documentation for more on xarray.Dataset.
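Because each group is an ordinary xarray.Dataset exposed as an attribute, all of xarray's selection and aggregation machinery applies directly. A minimal sketch, not part of the original notebook (the variable name x is the default ArviZ assigns to unnamed arrays):

[ ]:
import arviz as az
import numpy as np

idata = az.convert_to_inference_data(np.random.randn(4, 100))  # 4 chains, 100 draws
posterior = idata.posterior  # each group is an xarray.Dataset
print(posterior["x"].mean(dim=("chain", "draw")))  # aggregate with plain xarray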

[1]:
import arviz as az
import numpy as np

From a 1D NumPy array

[2]:
size = 100
dataset = az.convert_to_inference_data(np.random.randn(size))
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[2]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
Data variables:
    x        (chain, draw) float64 1.084 -0.5994 -0.894 ... 2.022 2.334 0.0274
Attributes:
    created_at:  2019-06-21T16:57:30.377550

From an n-dimensional NumPy array

[3]:
shape = (1, 2, 3, 4, 5)
dataset = az.convert_to_inference_data(np.random.randn(*shape))
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[3]:
<xarray.Dataset>
Dimensions:  (chain: 1, draw: 2, x_dim_0: 3, x_dim_1: 4, x_dim_2: 5)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1
  * x_dim_0  (x_dim_0) int64 0 1 2
  * x_dim_1  (x_dim_1) int64 0 1 2 3
  * x_dim_2  (x_dim_2) int64 0 1 2 3 4
Data variables:
    x        (chain, draw, x_dim_0, x_dim_1, x_dim_2) float64 -0.6808 ... -0.8528
Attributes:
    created_at:  2019-06-21T16:57:30.825192
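Note the convention at work here: the first two axes of an input array are interpreted as (chain, draw), and any remaining axes become auto-named dimensions such as x_dim_0. A quick check, sketched rather than taken from the original notebook:

[ ]:
arr = np.random.randn(4, 100, 3)  # 4 chains, 100 draws of a length-3 variable
idata = az.convert_to_inference_data(arr)
assert idata.posterior["x"].dims == ("chain", "draw", "x_dim_0")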

From a dictionary

[4]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
dataset = az.convert_to_inference_data(datadict)
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[4]:
<xarray.Dataset>
Dimensions:  (b_dim_0: 10, c_dim_0: 3, c_dim_1: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b_dim_0  (b_dim_0) int64 0 1 2 3 4 5 6 7 8 9
  * c_dim_0  (c_dim_0) int64 0 1 2
  * c_dim_1  (c_dim_1) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 -1.25 -0.1914 0.08028 ... 0.5569 -0.7766
    b        (chain, draw, b_dim_0) float64 -0.8137 1.036 ... 0.2932 -0.8754
    c        (chain, draw, c_dim_0, c_dim_1) float64 0.32 -0.004118 ... -0.3479
Attributes:
    created_at:  2019-06-21T16:57:31.544505

From a dictionary with coords and dims

[5]:
datadict = {
    'a': np.random.randn(100),
    'b': np.random.randn(1, 100, 10),
    'c': np.random.randn(1, 100, 3, 4),
}
coords = {
    'c1' : np.arange(3),
    'c2' : np.arange(4),
    'b1' : np.arange(10)
}
dims = {
    'b' : ['b1'],
    'c' : ['c1', 'c2']
}

dataset = az.convert_to_inference_data(
    datadict,
    coords=coords,
    dims=dims
)
print(dataset)
dataset.posterior
Inference data with groups:
        > posterior
[5]:
<xarray.Dataset>
Dimensions:  (b1: 10, c1: 3, c2: 4, chain: 1, draw: 100)
Coordinates:
  * chain    (chain) int64 0
  * draw     (draw) int64 0 1 2 3 4 5 6 7 8 9 ... 90 91 92 93 94 95 96 97 98 99
  * b1       (b1) int64 0 1 2 3 4 5 6 7 8 9
  * c1       (c1) int64 0 1 2
  * c2       (c2) int64 0 1 2 3
Data variables:
    a        (chain, draw) float64 1.697 -0.4425 0.3771 ... 1.135 -0.469 2.276
    b        (chain, draw, b1) float64 -0.2085 0.3442 0.2856 ... 0.2459 -1.886
    c        (chain, draw, c1, c2) float64 0.9242 -0.8723 ... -2.333 -0.5566
Attributes:
    created_at:  2019-06-21T16:57:32.498636
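The payoff of naming dims and coords is label-based selection: xarray's .sel can now index b and c by coordinate value rather than by position. For example (a sketch continuing from the cell above):

[ ]:
b_slice = dataset.posterior["b"].sel(b1=3)  # the b1=3 slice, dims (chain, draw)
print(b_slice)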

From PyMC3

[6]:
import pymc3 as pm
draws = 500
chains = 2

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=5)
    tau = pm.HalfCauchy('tau', beta=5)
    theta_tilde = pm.Normal('theta_tilde', mu=0, sd=1, shape=eight_school_data['J'])
    theta = pm.Deterministic('theta', mu + tau * theta_tilde)
    pm.Normal('obs', mu=theta, sd=eight_school_data['sigma'], observed=eight_school_data['y'])

    trace = pm.sample(draws, chains=chains)
    prior = pm.sample_prior_predictive()
    posterior_predictive = pm.sample_posterior_predictive(trace)

    pm_data = az.from_pymc3(
            trace=trace,
            prior=prior,
            posterior_predictive=posterior_predictive,
            coords={'school': np.arange(eight_school_data['J'])},
            dims={'theta': ['school'], 'theta_tilde': ['school']},
        )
pm_data
WARNING (theano.tensor.blas): Using NumPy C-API based implementation for BLAS functions.
Auto-assigning NUTS sampler...
Initializing NUTS using jitter+adapt_diag...
Multiprocess sampling (2 chains in 4 jobs)
NUTS: [theta_tilde, tau, mu]
Sampling 2 chains: 100%|██████████| 2000/2000 [00:00<00:00, 2197.77draws/s]
There were 3 divergences after tuning. Increase `target_accept` or reparameterize.
There was 1 divergence after tuning. Increase `target_accept` or reparameterize.
100%|██████████| 1000/1000 [00:00<00:00, 2134.56it/s]
[6]:
Inference data with groups:
    > posterior
    > sample_stats
    > posterior_predictive
    > prior
    > observed_data
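Once the trace is in InferenceData form, any ArviZ function accepts it regardless of which backend produced it. For instance (a sketch, not part of the original run):

[ ]:
az.summary(pm_data, var_names=["mu", "tau"])  # posterior summary table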

From PyStan

[7]:
import pystan

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

stan_model = pystan.StanModel(model_code=schools_code)
fit = stan_model.sampling(data=eight_school_data, control={"adapt_delta" : 0.9})

stan_data = az.from_pystan(
    posterior=fit,
    posterior_predictive='y_hat',
    observed_data=['y'],
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
    }
)

stan_data
[7]:
Inference data with groups:
    > posterior
    > sample_stats
    > posterior_predictive
    > observed_data

From Pyro

[8]:
import torch

import pyro
import pyro.distributions as dist
import pyro.poutine as poutine
from pyro.infer.mcmc import MCMC, NUTS

pyro.enable_validation(True)
pyro.set_rng_seed(0)

draws = 1000
warmup_steps = 0
eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}


def model(sigma):
    eta = pyro.sample('eta', dist.Normal(torch.zeros(eight_school_data['J']), torch.ones(eight_school_data['J'])))
    mu = pyro.sample('mu', dist.Normal(torch.zeros(1), 10 * torch.ones(1)))
    tau = pyro.sample('tau', dist.HalfCauchy(scale=25 * torch.ones(1)))

    theta = mu + tau * eta

    return pyro.sample("obs", dist.Normal(theta, sigma))


def conditioned_model(model, sigma, y):
    return poutine.condition(model, data={"obs": y})(sigma)



nuts_kernel = NUTS(conditioned_model, adapt_step_size=True)
posterior = MCMC(nuts_kernel,
                 num_samples=draws,
                 warmup_steps=warmup_steps).run(model, eight_school_data['sigma'], eight_school_data['y'])

pyro_data = az.from_pyro(posterior)
pyro_data
INFO:pyro.infer.mcmc.mcmc:Starting MCMC using kernel - NUTS ...
INFO:pyro.infer.mcmc.mcmc:Iteration: 50 [SAMPLE]
INFO:pyro.infer.mcmc.mcmc:Step size: 0.125390    Acceptance rate: 0.860000
...
INFO:pyro.infer.mcmc.mcmc:Iteration: 1000 [SAMPLE]
INFO:pyro.infer.mcmc.mcmc:Step size: 0.372683    Acceptance rate: 0.950000
[8]:
Inference data with groups:
    > posterior
    > observed_data

From emcee

[9]:
import emcee

eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}

def log_prior_8school(theta, J):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    # half-Cauchy prior on tau
    if tau < 0:
        return -np.inf
    hwhm = 25
    prior_tau = -np.log(tau**2 + hwhm**2)
    prior_mu = -(mu / 10)**2  # normal prior, loc=0, scale=10
    prior_eta = -np.sum(eta**2)  # normal prior, loc=0, scale=1
    return prior_mu + prior_tau + prior_eta

def log_likelihood_8school(theta, y, sigma):
    mu = theta[0]
    tau = theta[1]
    eta = theta[2:]
    return -np.sum(((mu + tau * eta - y) / sigma)**2)

def lnprob_8school(theta, J, y, sigma):
    prior = log_prior_8school(theta, J)
    if not np.isfinite(prior):
        return -np.inf
    like = log_likelihood_8school(theta, y, sigma)
    return like + prior

nwalkers = 40
ndim = eight_school_data['J'] + 2
draws = 1500
pos = np.random.normal(size=(nwalkers, ndim))
pos[:, 1] = np.absolute(pos[:, 1])  # tau is constrained to be positive
sampler = emcee.EnsembleSampler(
    nwalkers,
    ndim,
    lnprob_8school,
    args=(
        eight_school_data['J'],
        eight_school_data['y'],
        eight_school_data['sigma'],
    ),
)
sampler.run_mcmc(pos, draws)

# variable names must be supplied; they cannot be inferred from emcee
var_names = ['mu', 'tau'] + ['eta{}'.format(i) for i in range(eight_school_data['J'])]
emcee_data = az.from_emcee(sampler, var_names=var_names)
emcee_data
[9]:
Inference data with groups:
    > posterior
    > observed_data
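from_emcee maps each walker to a chain, so this posterior contains 40 chains of 1500 draws. A quick sanity check (a sketch, assuming that walker-to-chain mapping):

[ ]:
assert emcee_data.posterior.dims["chain"] == nwalkers
assert emcee_data.posterior.dims["draw"] == draws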

From CmdStanPy

[10]:
from cmdstanpy import Model

schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)

stan_file = "./eight_school.stan"
stan_model = Model(stan_file=stan_file)
stan_model.compile()

eight_school_data = {'J': 8,
                     'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
                     'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
                    }

stan_fit = stan_model.sample(data=eight_school_data)

cmdstanpy_data = az.from_cmdstanpy(
    posterior=stan_fit,
    posterior_predictive='y_hat',
    observed_data={'y' : eight_school_data["y"]},
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={
        'theta': ['school'],
        'y': ['school'],
        'log_lik': ['school'],
        'y_hat': ['school'],
        'theta_tilde': ['school']
        }
)
cmdstanpy_data
INFO:cmdstanpy:stan to c++ (eight_school.hpp)
INFO:cmdstanpy:compiling c++
INFO:cmdstanpy:compiled model file: ./eight_school
INFO:cmdstanpy:start chain 1
INFO:cmdstanpy:start chain 2
INFO:cmdstanpy:finish chain 2
INFO:cmdstanpy:start chain 3
INFO:cmdstanpy:finish chain 1
INFO:cmdstanpy:start chain 4
INFO:cmdstanpy:finish chain 3
INFO:cmdstanpy:finish chain 4
[10]:
Inference data with groups:
    > posterior
    > sample_stats
    > posterior_predictive
    > observed_data

From CmdStan

See from_cmdstan for details.

CmdStan helpers

[11]:
# save the CmdStanPy samples as CSV files for the CmdStan example below
stan_fit.save_csvfiles(dir="sample_data", basename="eight_school_samples")
[12]:
schools_code = """
data {
    int<lower=0> J;
    real y[J];
    real<lower=0> sigma[J];
}

parameters {
    real mu;
    real<lower=0> tau;
    real theta_tilde[J];
}

transformed parameters {
    real theta[J];
    for (j in 1:J)
        theta[j] = mu + tau * theta_tilde[j];
}

model {
    mu ~ normal(0, 5);
    tau ~ cauchy(0, 5);
    theta_tilde ~ normal(0, 1);
    y ~ normal(theta, sigma);
}

generated quantities {
    vector[J] log_lik;
    vector[J] y_hat;
    for (j in 1:J) {
        log_lik[j] = normal_lpdf(y[j] | theta[j], sigma[j]);
        y_hat[j] = normal_rng(theta[j], sigma[j]);
    }
}
"""

with open("./eight_school.stan", "w") as f:
    print(schools_code, file=f)
[13]:
eight_school_data = {
    'J': 8,
    'y': np.array([28., 8., -3., 7., -1., 1., 18., 12.]),
    'sigma': np.array([15., 10., 16., 11., 9., 11., 10., 18.])
}
[14]:
import pystan
pystan.stan_rdump(eight_school_data, "./eight_school.data.R")
[15]:
# Bash shell
#
# $ cd cmdstan
# $ make build
# $ make path/to/eight_school
# $ cd path/to
# $ for i in {1..4}
#   do
#     ./eight_school sample random seed=12345 \
#       id=$i data file=eight_school.data.R \
#       output file=sample_data/eight_school_samples-$i.csv &
#   done
# $
[16]:
# Use the .stan and .csv files created and saved by the CmdStanPy run above

# glob string
posterior_glob = "sample_data/eight_school_samples-[0-9].csv"
# list of paths
# posterior_list =  [
#     "sample_data/eight_school_samples-1.csv",
#     "sample_data/eight_school_samples-2.csv",
#     "sample_data/eight_school_samples-3.csv",
#     "sample_data/eight_school_samples-4.csv",
# ]

obs_data_path = "./eight_school.data.R"

cmdstan_data = az.from_cmdstan(
    posterior=posterior_glob,
    posterior_predictive='y_hat',
    observed_data=obs_data_path,
    observed_data_var="y",
    log_likelihood='log_lik',
    coords={'school': np.arange(eight_school_data['J'])},
    dims={'theta': ['school'],
          'y': ['school'],
          'log_lik': ['school'],
          'y_hat': ['school'],
          'theta_tilde': ['school']
         }
)
cmdstan_data
arviz.data.io_cmdstan - INFO - glob found 4 files for 'posterior':
1: sample_data/eight_school_samples-1.csv
2: sample_data/eight_school_samples-2.csv
3: sample_data/eight_school_samples-3.csv
4: sample_data/eight_school_samples-4.csv
[16]:
Inference data with groups:
    > posterior
    > sample_stats
    > posterior_predictive
    > observed_data
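Finally, an InferenceData object can be round-tripped through a NetCDF file, which is itself another way to generate one later. A sketch using ArviZ's serialization helpers (the filename is illustrative):

[ ]:
cmdstan_data.to_netcdf("eight_school_results.nc")  # write every group to disk
idata = az.from_netcdf("eight_school_results.nc")  # reload as InferenceData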