Skip to content

Inference

Training algorithms

NPE_A

Bases: PosteriorEstimator

Source code in sbi/inference/trainers/npe/npe_a.py
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
class NPE_A(PosteriorEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        density_estimator: Union[str, Callable] = "mdn_snpe_a",
        num_components: int = 10,
        device: str = "cpu",
        logging_level: Union[int, str] = "WARNING",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""NPE-A [1].

        [1] _Fast epsilon-free Inference of Simulation Models with Bayesian Conditional
            Density Estimation_, Papamakarios et al., NeurIPS 2016,
            https://arxiv.org/abs/1605.06376.

        Like all NPE methods, this method trains a deep neural density estimator to
        directly approximate the posterior. Also like all other NPE methods, in the
        first round, this density estimator is trained with a maximum-likelihood loss.

        This class implements NPE-A. NPE-A trains across multiple rounds with a
        maximum-likelihood-loss. This will make training converge to the proposal
        posterior instead of the true posterior. To correct for this, SNPE-A applies a
        post-hoc correction after training. This correction has to be performed
        analytically. Thus, NPE-A is limited to Gaussian distributions for all but the
        last round. In the last round, NPE-A can use a Mixture of Gaussians.

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. Any
                object with `.log_prob()`and `.sample()` (for example, a PyTorch
                distribution) can be used.
            density_estimator: If it is a string (only "mdn_snpe_a" is valid), use a
                pre-configured mixture of densities network. Alternatively, a function
                that builds a custom neural network can be provided. The function will
                be called with the first batch of simulations (theta, x), which can
                thus be used for shape inference and potentially for z-scoring. It
                needs to return a PyTorch `nn.Module` implementing the density
                estimator. The density estimator needs to provide the methods
                `.log_prob` and `.sample()`. Note that until the last round only a
                single (multivariate) Gaussian component is used for training (see
                Algorithm 1 in [1]). In the last round, this component is replicated
                `num_components` times, its parameters are perturbed with a very small
                noise, and then the last training round is done with the expanded
                Gaussian mixture as estimator for the proposal posterior.
            num_components: Number of components of the mixture of Gaussians in the
                last round. This overrides the `num_components` value passed to
                `posterior_nn()`.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during training.
        """

        # Catch invalid inputs.
        if not ((density_estimator == "mdn_snpe_a") or callable(density_estimator)):
            raise TypeError(
                "The `density_estimator` passed to SNPE_A needs to be a "
                "callable or the string 'mdn_snpe_a'!"
            )

        # `num_components` will be used to replicate the Gaussian in the last round.
        self._num_components = num_components
        self._ran_final_round = False

        # WARNING: sneaky trick ahead. We proxy the parent's `train` here,
        # requiring the signature to have `num_atoms`, save it for use below, and
        # continue. It's sneaky because we are using the object (self) as a namespace
        # to pass arguments between functions, and that's implicit state management.
        kwargs = del_entries(
            locals(),
            entries=("self", "__class__", "num_components"),
        )
        super().__init__(**kwargs)

    def train(
        self,
        final_round: bool = False,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        calibration_kernel: Optional[Callable] = None,
        resume_training: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
        component_perturbation: float = 5e-3,
    ) -> ConditionalDensityEstimator:
        r"""Return density estimator that approximates the proposal posterior.

        [1] _Fast epsilon-free Inference of Simulation Models with Bayesian Conditional
            Density Estimation_, Papamakarios et al., NeurIPS 2016,
            https://arxiv.org/abs/1605.06376.

        Training is performed with maximum likelihood on samples from the latest round,
        which leads the algorithm to converge to the proposal posterior.

        Args:
            final_round: Whether we are in the last round of training or not. For all
                but the last round, Algorithm 1 from [1] is executed. In last the
                round, Algorithm 2 from [1] is executed once.
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            calibration_kernel: A function to calibrate the loss with respect to the
                simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            force_first_round_loss: If `True`, train with maximum likelihood,
                i.e., potentially ignoring the correction for using a proposal
                distribution different from the prior.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round. Not supported for
                SNPE-A.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)
            component_perturbation: The standard deviation applied to all weights and
                biases when, in the last round, the Mixture of Gaussians is build from
                a single Gaussian. This value can be problem-specific and also depends
                on the number of mixture components.

        Returns:
            Density estimator that approximates the distribution $p(\theta|x)$.
        """

        assert not retrain_from_scratch, """Retraining from scratch is not supported in
            SNPE-A yet. The reason for this is that, if we reininitialized the density
            estimator, the z-scoring would change, which would break the posthoc
            correction. This is a pure implementation issue."""

        kwargs = del_entries(
            locals(),
            entries=(
                "self",
                "__class__",
                "final_round",
                "component_perturbation",
            ),
        )

        # SNPE-A always discards the prior samples.
        kwargs["discard_prior_samples"] = True
        kwargs["force_first_round_loss"] = True

        self._round = max(self._data_round_index)

        if final_round:
            # If there is (will be) only one round, train with Algorithm 2 from [1].
            if self._round == 0:
                self._build_neural_net = partial(
                    self._build_neural_net, num_components=self._num_components
                )
            # Run Algorithm 2 from [1].
            elif not self._ran_final_round:
                # Now switch to the specified number of components. This method will
                # only be used if `retrain_from_scratch=True`. Otherwise,
                # the MDN will be built from replicating the single-component net for
                # `num_component` times (via `_expand_mog()`).
                self._build_neural_net = partial(
                    self._build_neural_net, num_components=self._num_components
                )

                # Extend the MDN to the originally desired number of components.
                self._expand_mog(eps=component_perturbation)
            else:
                warnings.warn(
                    "You have already run SNPE-A with `final_round=True`. Running it"
                    "again with this setting will not allow computing the posthoc"
                    "correction applied in SNPE-A. Thus, you will get an error when "
                    "calling `.build_posterior()` after training.",
                    UserWarning,
                    stacklevel=2,
                )
        else:
            # Run Algorithm 1 from [1].
            # Wrap the function that builds the MDN such that we can make
            # sure that there is only one component when running.
            self._build_neural_net = partial(self._build_neural_net, num_components=1)

        if final_round:
            self._ran_final_round = True

        return super().train(**kwargs)

    def correct_for_proposal(
        self,
        density_estimator: Optional[TorchModule] = None,
    ) -> "NPE_A_MDN":
        r"""Build mixture of Gaussians that approximates the posterior.

        Returns a `NPE_A_MDN` object, which applies the posthoc-correction required in
        SNPE-A.

        Args:
            density_estimator: The density estimator that the posterior is based on.
                If `None`, use the latest neural density estimator that was trained.

        Returns:
            Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
        """
        if density_estimator is None:
            density_estimator = deepcopy(
                self._neural_net
            )  # PosteriorEstimator.train() also returns a deepcopy, mimic this here
            # If internal net is used device is defined.
            device = self._device
        else:
            # Otherwise, infer it from the device of the net parameters.
            device = str(next(density_estimator.parameters()).device)

        # Set proposal of the density estimator.
        # This also evokes the z-scoring correction if necessary.
        if (
            self._proposal_roundwise[-1] is self._prior
            or self._proposal_roundwise[-1] is None
        ):
            proposal = self._prior
            assert isinstance(
                proposal, (MultivariateNormal, BoxUniform)
            ), """Prior must be `torch.distributions.MultivariateNormal` or `sbi.utils.
                BoxUniform`"""
        else:
            assert isinstance(
                self._proposal_roundwise[-1], DirectPosterior
            ), """The proposal you passed to `append_simulations` is neither the prior
                nor a `DirectPosterior`. SNPE-A currently only supports these scenarios.
                """
            proposal = self._proposal_roundwise[-1]

        # Create the NPE_A_MDN
        wrapped_density_estimator = NPE_A_MDN(
            flow=density_estimator,  # type: ignore
            proposal=proposal,
            prior=self._prior,
            device=device,
        )
        return wrapped_density_estimator

    def build_posterior(
        self,
        density_estimator: Optional[TorchModule] = None,
        prior: Optional[Distribution] = None,
        **kwargs,
    ) -> "DirectPosterior":
        r"""Build posterior from the neural density estimator.

        This method first corrects the estimated density with `correct_for_proposal`
        and then returns a `DirectPosterior`.

        Args:
            density_estimator: The density estimator that the posterior is based on.
                If `None`, use the latest neural density estimator that was trained.
            prior: Prior distribution.

        Returns:
            Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
        """
        if prior is None:
            assert (
                self._prior is not None
            ), """You did not pass a prior. You have to pass the prior either at
                initialization `inference = SNPE_A(prior)` or to `.build_posterior
                (prior=prior)`."""
            prior = self._prior

        wrapped_density_estimator = self.correct_for_proposal(
            density_estimator=density_estimator
        )
        self._posterior = super().build_posterior(
            density_estimator=wrapped_density_estimator,
            prior=prior,
            **kwargs,
        )
        return deepcopy(self._posterior)  # type: ignore

    def _log_prob_proposal_posterior(
        self,
        theta: Tensor,
        x: Tensor,
        masks: Tensor,
        proposal: Optional[Any],
    ) -> Tensor:
        """Return the log-probability of the proposal posterior.

        For SNPE-A this is the same as `self._neural_net.log_prob(theta, x)` in
        `_loss()` to be found in `snpe_base.py`.

        Args:
            theta: Batch of parameters θ.
            x: Batch of data.
            masks: Mask that is True for prior samples in the batch in order to train
                them with prior loss.
            proposal: Proposal distribution.

        Returns: Log-probability of the proposal posterior.
        """
        return self._neural_net.log_prob(theta, x)

    def _expand_mog(self, eps: float = 1e-5):
        """
        Replicate a singe Gaussian trained with Algorithm 1 before continuing
        with Algorithm 2. The weights and biases of the associated MDN layers
        are repeated `num_components` times, slightly perturbed to break the
        symmetry such that the gradients in the subsequent training are not
        all identical.

        Args:
            eps: Standard deviation for the random perturbation.
        """
        assert isinstance(self._neural_net.net._distribution, MultivariateGaussianMDN)

        # Increase the number of components
        self._neural_net.net._distribution._num_components = self._num_components

        # Expand the 1-dim Gaussian.
        for name, param in self._neural_net.named_parameters():
            if any(
                key in name for key in ["logits", "means", "unconstrained", "upper"]
            ):
                if "bias" in name:
                    param.data = param.data.repeat(self._num_components)
                    param.data.add_(torch.randn_like(param.data) * eps)
                    param.grad = None  # let autograd construct a new gradient
                elif "weight" in name:
                    param.data = param.data.repeat(self._num_components, 1)
                    param.data.add_(torch.randn_like(param.data) * eps)
                    param.grad = None  # let autograd construct a new gradient

__init__(prior=None, density_estimator='mdn_snpe_a', num_components=10, device='cpu', logging_level='WARNING', summary_writer=None, show_progress_bars=True)

NPE-A [1].

[1] Fast epsilon-free Inference of Simulation Models with Bayesian Conditional Density Estimation, Papamakarios et al., NeurIPS 2016, https://arxiv.org/abs/1605.06376.

Like all NPE methods, this method trains a deep neural density estimator to directly approximate the posterior. Also like all other NPE methods, in the first round, this density estimator is trained with a maximum-likelihood loss.

This class implements NPE-A. NPE-A trains across multiple rounds with a maximum-likelihood-loss. This will make training converge to the proposal posterior instead of the true posterior. To correct for this, SNPE-A applies a post-hoc correction after training. This correction has to be performed analytically. Thus, NPE-A is limited to Gaussian distributions for all but the last round. In the last round, NPE-A can use a Mixture of Gaussians.

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. Any object with .log_prob()and .sample() (for example, a PyTorch distribution) can be used.

None
density_estimator Union[str, Callable]

If it is a string (only “mdn_snpe_a” is valid), use a pre-configured mixture of densities network. Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the density estimator. The density estimator needs to provide the methods .log_prob and .sample(). Note that until the last round only a single (multivariate) Gaussian component is used for training (see Algorithm 1 in [1]). In the last round, this component is replicated num_components times, its parameters are perturbed with a very small noise, and then the last training round is done with the expanded Gaussian mixture as estimator for the proposal posterior.

'mdn_snpe_a'
num_components int

Number of components of the mixture of Gaussians in the last round. This overrides the num_components value passed to posterior_nn().

10
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'WARNING'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during training.

True
Source code in sbi/inference/trainers/npe/npe_a.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def __init__(
    self,
    prior: Optional[Distribution] = None,
    density_estimator: Union[str, Callable] = "mdn_snpe_a",
    num_components: int = 10,
    device: str = "cpu",
    logging_level: Union[int, str] = "WARNING",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""NPE-A [1].

    [1] _Fast epsilon-free Inference of Simulation Models with Bayesian Conditional
        Density Estimation_, Papamakarios et al., NeurIPS 2016,
        https://arxiv.org/abs/1605.06376.

    Like all NPE methods, this method trains a deep neural density estimator to
    directly approximate the posterior. Also like all other NPE methods, in the
    first round, this density estimator is trained with a maximum-likelihood loss.

    This class implements NPE-A. NPE-A trains across multiple rounds with a
    maximum-likelihood-loss. This will make training converge to the proposal
    posterior instead of the true posterior. To correct for this, SNPE-A applies a
    post-hoc correction after training. This correction has to be performed
    analytically. Thus, NPE-A is limited to Gaussian distributions for all but the
    last round. In the last round, NPE-A can use a Mixture of Gaussians.

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. Any
            object with `.log_prob()`and `.sample()` (for example, a PyTorch
            distribution) can be used.
        density_estimator: If it is a string (only "mdn_snpe_a" is valid), use a
            pre-configured mixture of densities network. Alternatively, a function
            that builds a custom neural network can be provided. The function will
            be called with the first batch of simulations (theta, x), which can
            thus be used for shape inference and potentially for z-scoring. It
            needs to return a PyTorch `nn.Module` implementing the density
            estimator. The density estimator needs to provide the methods
            `.log_prob` and `.sample()`. Note that until the last round only a
            single (multivariate) Gaussian component is used for training (see
            Algorithm 1 in [1]). In the last round, this component is replicated
            `num_components` times, its parameters are perturbed with a very small
            noise, and then the last training round is done with the expanded
            Gaussian mixture as estimator for the proposal posterior.
        num_components: Number of components of the mixture of Gaussians in the
            last round. This overrides the `num_components` value passed to
            `posterior_nn()`.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during training.
    """

    # Catch invalid inputs.
    if not ((density_estimator == "mdn_snpe_a") or callable(density_estimator)):
        raise TypeError(
            "The `density_estimator` passed to SNPE_A needs to be a "
            "callable or the string 'mdn_snpe_a'!"
        )

    # `num_components` will be used to replicate the Gaussian in the last round.
    self._num_components = num_components
    self._ran_final_round = False

    # WARNING: sneaky trick ahead. We proxy the parent's `train` here,
    # requiring the signature to have `num_atoms`, save it for use below, and
    # continue. It's sneaky because we are using the object (self) as a namespace
    # to pass arguments between functions, and that's implicit state management.
    kwargs = del_entries(
        locals(),
        entries=("self", "__class__", "num_components"),
    )
    super().__init__(**kwargs)

build_posterior(density_estimator=None, prior=None, **kwargs)

Build posterior from the neural density estimator.

This method first corrects the estimated density with correct_for_proposal and then returns a DirectPosterior.

Parameters:

Name Type Description Default
density_estimator Optional[TorchModule]

The density estimator that the posterior is based on. If None, use the latest neural density estimator that was trained.

None
prior Optional[Distribution]

Prior distribution.

None

Returns:

Type Description
DirectPosterior

Posterior \(p(\theta|x)\) with .sample() and .log_prob() methods.

Source code in sbi/inference/trainers/npe/npe_a.py
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
def build_posterior(
    self,
    density_estimator: Optional[TorchModule] = None,
    prior: Optional[Distribution] = None,
    **kwargs,
) -> "DirectPosterior":
    r"""Build posterior from the neural density estimator.

    This method first corrects the estimated density with `correct_for_proposal`
    and then returns a `DirectPosterior`.

    Args:
        density_estimator: The density estimator that the posterior is based on.
            If `None`, use the latest neural density estimator that was trained.
        prior: Prior distribution.

    Returns:
        Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
    """
    if prior is None:
        assert (
            self._prior is not None
        ), """You did not pass a prior. You have to pass the prior either at
            initialization `inference = SNPE_A(prior)` or to `.build_posterior
            (prior=prior)`."""
        prior = self._prior

    wrapped_density_estimator = self.correct_for_proposal(
        density_estimator=density_estimator
    )
    self._posterior = super().build_posterior(
        density_estimator=wrapped_density_estimator,
        prior=prior,
        **kwargs,
    )
    return deepcopy(self._posterior)  # type: ignore

correct_for_proposal(density_estimator=None)

Build mixture of Gaussians that approximates the posterior.

Returns a NPE_A_MDN object, which applies the posthoc-correction required in SNPE-A.

Parameters:

Name Type Description Default
density_estimator Optional[TorchModule]

The density estimator that the posterior is based on. If None, use the latest neural density estimator that was trained.

None

Returns:

Type Description
NPE_A_MDN

Posterior \(p(\theta|x)\) with .sample() and .log_prob() methods.

Source code in sbi/inference/trainers/npe/npe_a.py
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
def correct_for_proposal(
    self,
    density_estimator: Optional[TorchModule] = None,
) -> "NPE_A_MDN":
    r"""Build mixture of Gaussians that approximates the posterior.

    Returns a `NPE_A_MDN` object, which applies the posthoc-correction required in
    SNPE-A.

    Args:
        density_estimator: The density estimator that the posterior is based on.
            If `None`, use the latest neural density estimator that was trained.

    Returns:
        Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
    """
    if density_estimator is None:
        density_estimator = deepcopy(
            self._neural_net
        )  # PosteriorEstimator.train() also returns a deepcopy, mimic this here
        # If internal net is used device is defined.
        device = self._device
    else:
        # Otherwise, infer it from the device of the net parameters.
        device = str(next(density_estimator.parameters()).device)

    # Set proposal of the density estimator.
    # This also evokes the z-scoring correction if necessary.
    if (
        self._proposal_roundwise[-1] is self._prior
        or self._proposal_roundwise[-1] is None
    ):
        proposal = self._prior
        assert isinstance(
            proposal, (MultivariateNormal, BoxUniform)
        ), """Prior must be `torch.distributions.MultivariateNormal` or `sbi.utils.
            BoxUniform`"""
    else:
        assert isinstance(
            self._proposal_roundwise[-1], DirectPosterior
        ), """The proposal you passed to `append_simulations` is neither the prior
            nor a `DirectPosterior`. SNPE-A currently only supports these scenarios.
            """
        proposal = self._proposal_roundwise[-1]

    # Create the NPE_A_MDN
    wrapped_density_estimator = NPE_A_MDN(
        flow=density_estimator,  # type: ignore
        proposal=proposal,
        prior=self._prior,
        device=device,
    )
    return wrapped_density_estimator

train(final_round=False, training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, calibration_kernel=None, resume_training=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None, component_perturbation=0.005)

Return density estimator that approximates the proposal posterior.

[1] Fast epsilon-free Inference of Simulation Models with Bayesian Conditional Density Estimation, Papamakarios et al., NeurIPS 2016, https://arxiv.org/abs/1605.06376.

Training is performed with maximum likelihood on samples from the latest round, which leads the algorithm to converge to the proposal posterior.

Parameters:

Name Type Description Default
final_round bool

Whether we are in the last round of training or not. For all but the last round, Algorithm 1 from [1] is executed. In last the round, Algorithm 2 from [1] is executed once.

False
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

20
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
calibration_kernel Optional[Callable]

A function to calibrate the loss with respect to the simulations x. See Lueckmann, Gonçalves et al., NeurIPS 2017.

None
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
force_first_round_loss

If True, train with maximum likelihood, i.e., potentially ignoring the correction for using a proposal distribution different from the prior.

required
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round. Not supported for SNPE-A.

False
show_train_summary bool

Whether to print the number of epochs and validation loss and leakage after the training.

False
dataloader_kwargs Optional[Dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None
component_perturbation float

The standard deviation applied to all weights and biases when, in the last round, the Mixture of Gaussians is build from a single Gaussian. This value can be problem-specific and also depends on the number of mixture components.

0.005

Returns:

Type Description
ConditionalDensityEstimator

Density estimator that approximates the distribution \(p(\theta|x)\).

Source code in sbi/inference/trainers/npe/npe_a.py
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
def train(
    self,
    final_round: bool = False,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    resume_training: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
    component_perturbation: float = 5e-3,
) -> ConditionalDensityEstimator:
    r"""Return density estimator that approximates the proposal posterior.

    [1] _Fast epsilon-free Inference of Simulation Models with Bayesian Conditional
        Density Estimation_, Papamakarios et al., NeurIPS 2016,
        https://arxiv.org/abs/1605.06376.

    Training is performed with maximum likelihood on samples from the latest round,
    which leads the algorithm to converge to the proposal posterior.

    Args:
        final_round: Whether we are in the last round of training or not. For all
            but the last round, Algorithm 1 from [1] is executed. In last the
            round, Algorithm 2 from [1] is executed once.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to the
            simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        force_first_round_loss: If `True`, train with maximum likelihood,
            i.e., potentially ignoring the correction for using a proposal
            distribution different from the prior.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round. Not supported for
            SNPE-A.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)
        component_perturbation: The standard deviation applied to all weights and
            biases when, in the last round, the Mixture of Gaussians is build from
            a single Gaussian. This value can be problem-specific and also depends
            on the number of mixture components.

    Returns:
        Density estimator that approximates the distribution $p(\theta|x)$.
    """

    assert not retrain_from_scratch, """Retraining from scratch is not supported in
        SNPE-A yet. The reason for this is that, if we reininitialized the density
        estimator, the z-scoring would change, which would break the posthoc
        correction. This is a pure implementation issue."""

    kwargs = del_entries(
        locals(),
        entries=(
            "self",
            "__class__",
            "final_round",
            "component_perturbation",
        ),
    )

    # SNPE-A always discards the prior samples.
    kwargs["discard_prior_samples"] = True
    kwargs["force_first_round_loss"] = True

    self._round = max(self._data_round_index)

    if final_round:
        # If there is (will be) only one round, train with Algorithm 2 from [1].
        if self._round == 0:
            self._build_neural_net = partial(
                self._build_neural_net, num_components=self._num_components
            )
        # Run Algorithm 2 from [1].
        elif not self._ran_final_round:
            # Now switch to the specified number of components. This method will
            # only be used if `retrain_from_scratch=True`. Otherwise,
            # the MDN will be built from replicating the single-component net for
            # `num_component` times (via `_expand_mog()`).
            self._build_neural_net = partial(
                self._build_neural_net, num_components=self._num_components
            )

            # Extend the MDN to the originally desired number of components.
            self._expand_mog(eps=component_perturbation)
        else:
            warnings.warn(
                "You have already run SNPE-A with `final_round=True`. Running it"
                "again with this setting will not allow computing the posthoc"
                "correction applied in SNPE-A. Thus, you will get an error when "
                "calling `.build_posterior()` after training.",
                UserWarning,
                stacklevel=2,
            )
    else:
        # Run Algorithm 1 from [1].
        # Wrap the function that builds the MDN such that we can make
        # sure that there is only one component when running.
        self._build_neural_net = partial(self._build_neural_net, num_components=1)

    if final_round:
        self._ran_final_round = True

    return super().train(**kwargs)

NPE_C

Bases: PosteriorEstimator

Source code in sbi/inference/trainers/npe/npe_c.py
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
class NPE_C(PosteriorEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        density_estimator: Union[str, Callable] = "maf",
        device: str = "cpu",
        logging_level: Union[int, str] = "WARNING",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""NPE-C / APT [1].

        [1] _Automatic Posterior Transformation for Likelihood-free Inference_,
            Greenberg et al., ICML 2019, https://arxiv.org/abs/1905.07488.

        Like all NPE methods, this method trains a deep neural density estimator to
        directly approximate the posterior. Also like all other NPE methods, in the
        first round, this density estimator is trained with a maximum-likelihood loss.

        For the sequential mode in which the density estimator is trained across rounds,
        this class implements two loss variants of NPE-C: the non-atomic and the atomic
        version. The atomic loss of NPE-C can be used for any density estimator,
        i.e. also for normalizing flows. However, it suffers from leakage issues. On
        the other hand, the non-atomic loss can only be used only if the proposal
        distribution is a mixture of Gaussians, the density estimator is a mixture of
        Gaussians, and the prior is either Gaussian or Uniform. It does not suffer from
        leakage issues. At the beginning of each round, we print whether the non-atomic
        or the atomic version is used.

        In this codebase, we will automatically switch to the non-atomic loss if the
        following criteria are fulfilled:<br/>
        - proposal is a `DirectPosterior` with density_estimator `mdn`, as built
            with `sbi.neural_nets.posterior_nn()`.<br/>
        - the density estimator is a `mdn`, as built with
            `sbi.neural_nets.posterior_nn()`.<br/>
        - `isinstance(prior, MultivariateNormal)` (from `torch.distributions`) or
            `isinstance(prior, sbi.utils.BoxUniform)`

        Note that custom implementations of any of these densities (or estimators) will
        not trigger the non-atomic loss, and the algorithm will fall back onto using
        the atomic loss.

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them.
            density_estimator: If it is a string, use a pre-configured network of the
                provided type (one of nsf, maf, mdn, made). Alternatively, a function
                that builds a custom neural network can be provided. The function will
                be called with the first batch of simulations (theta, x), which can
                thus be used for shape inference and potentially for z-scoring. It
                needs to return a PyTorch `nn.Module` implementing the density
                estimator. The density estimator needs to provide the methods
                `.log_prob` and `.sample()`.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during training.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

    def train(
        self,
        num_atoms: int = 10,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        calibration_kernel: Optional[Callable] = None,
        resume_training: bool = False,
        force_first_round_loss: bool = False,
        discard_prior_samples: bool = False,
        use_combined_loss: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
    ) -> nn.Module:
        r"""Return density estimator that approximates the distribution $p(\theta|x)$.

        Args:
            num_atoms: Number of atoms to use for classification.
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            calibration_kernel: A function to calibrate the loss with respect to the
                simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            force_first_round_loss: If `True`, train with maximum likelihood,
                i.e., potentially ignoring the correction for using a proposal
                distribution different from the prior.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            use_combined_loss: Whether to train the neural net also on prior samples
                using maximum likelihood in addition to training it on all samples using
                atomic loss. The extra MLE loss helps prevent density leaking with
                bounded priors.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)

        Returns:
            Density estimator that approximates the distribution $p(\theta|x)$.
        """

        # WARNING: sneaky trick ahead. We proxy the parent's `train` here,
        # requiring the signature to have `num_atoms`, save it for use below, and
        # continue. It's sneaky because we are using the object (self) as a namespace
        # to pass arguments between functions, and that's implicit state management.
        self._num_atoms = num_atoms
        self._use_combined_loss = use_combined_loss
        kwargs = del_entries(
            locals(),
            entries=("self", "__class__", "num_atoms", "use_combined_loss"),
        )

        self._round = max(self._data_round_index)

        if self._round > 0:
            # Set the proposal to the last proposal that was passed by the user. For
            # atomic SNPE, it does not matter what the proposal is. For non-atomic
            # SNPE, we only use the latest data that was passed, i.e. the one from the
            # last proposal.
            proposal = self._proposal_roundwise[-1]
            self.use_non_atomic_loss = (
                isinstance(proposal, DirectPosterior)
                and isinstance(proposal.posterior_estimator.net._distribution, mdn)
                and isinstance(self._neural_net.net._distribution, mdn)
                and check_dist_class(
                    self._prior, class_to_check=(Uniform, MultivariateNormal)
                )[0]
            )

            algorithm = "non-atomic" if self.use_non_atomic_loss else "atomic"
            print(f"Using SNPE-C with {algorithm} loss")

            if self.use_non_atomic_loss:
                # Take care of z-scoring, pre-compute and store prior terms.
                self._set_state_for_mog_proposal()

        return super().train(**kwargs)

    def _set_state_for_mog_proposal(self) -> None:
        """Set state variables that are used at each training step of non-atomic SNPE-C.

        Three things are computed:
        1) Check if z-scoring was requested. To do so, we check if the `_transform`
            argument of the net had been a `CompositeTransform`. See pyknos mdn.py.
        2) Define a (potentially standardized) prior. It's standardized if z-scoring
            had been requested.
        3) Compute (Precision * mean) for the prior. This quantity is used at every
            training step if the prior is Gaussian.
        """

        self.z_score_theta = isinstance(
            self._neural_net.net._transform, CompositeTransform
        )

        self._set_maybe_z_scored_prior()

        if isinstance(self._maybe_z_scored_prior, MultivariateNormal):
            self.prec_m_prod_prior = torch.mv(
                self._maybe_z_scored_prior.precision_matrix,  # type: ignore
                self._maybe_z_scored_prior.loc,  # type: ignore
            )

    def _set_maybe_z_scored_prior(self) -> None:
        r"""Compute and store potentially standardized prior (if z-scoring was done).

        The proposal posterior is:
        $pp(\theta|x) = 1/Z * q(\theta|x) * prop(\theta) / p(\theta)$

        Let's denote z-scored theta by `a`: a = (theta - mean) / std
        Then pp'(a|x) = 1/Z_2 * q'(a|x) * prop'(a) / p'(a)$

        The ' indicates that the evaluation occurs in standardized space. The constant
        scaling factor has been absorbed into Z_2.
        From the above equation, we see that we need to evaluate the prior **in
        standardized space**. We build the standardized prior in this function.

        The standardize transform that is applied to the samples theta does not use
        the exact prior mean and std (due to implementation issues). Hence, the z-scored
        prior will not be exactly have mean=0 and std=1.
        """

        if self.z_score_theta:
            scale = self._neural_net.net._transform._transforms[0]._scale
            shift = self._neural_net.net._transform._transforms[0]._shift

            # Following the definintion of the linear transform in
            # `standardizing_transform` in `sbiutils.py`:
            # shift=-mean / std
            # scale=1 / std
            # Solving these equations for mean and std:
            estim_prior_std = 1 / scale
            estim_prior_mean = -shift * estim_prior_std

            # Compute the discrepancy of the true prior mean and std and the mean and
            # std that was empirically estimated from samples.
            # N(theta|m,s) = N((theta-m_e)/s_e|(m-m_e)/s_e, s/s_e)
            # Above: m,s are true prior mean and std. m_e,s_e are estimated prior mean
            # and std (estimated from samples and used to build standardize transform).
            almost_zero_mean = (self._prior.mean - estim_prior_mean) / estim_prior_std
            almost_one_std = torch.sqrt(self._prior.variance) / estim_prior_std

            if isinstance(self._prior, MultivariateNormal):
                self._maybe_z_scored_prior = MultivariateNormal(
                    almost_zero_mean, torch.diag(almost_one_std)
                )
            else:
                range_ = torch.sqrt(almost_one_std * 3.0)
                self._maybe_z_scored_prior = BoxUniform(
                    almost_zero_mean - range_, almost_zero_mean + range_
                )
        else:
            self._maybe_z_scored_prior = self._prior

    def _log_prob_proposal_posterior(
        self,
        theta: Tensor,
        x: Tensor,
        masks: Tensor,
        proposal: DirectPosterior,
    ) -> Tensor:
        """Return the log-probability of the proposal posterior.

        If the proposal is a MoG, the density estimator is a MoG, and the prior is
        either Gaussian or uniform, we use non-atomic loss. Else, use atomic loss (which
        suffers from leakage).

        Args:
            theta: Batch of parameters θ.
            x: Batch of data.
            masks: Mask that is True for prior samples in the batch in order to train
                them with prior loss.
            proposal: Proposal distribution.

        Returns: Log-probability of the proposal posterior.
        """

        if self.use_non_atomic_loss:
            if not (
                hasattr(self._neural_net.net, "_distribution")
                and isinstance(self._neural_net.net._distribution, mdn)
            ):
                raise ValueError(
                    "The density estimator must be a MDNtext for non-atomic loss."
                )

            return self._log_prob_proposal_posterior_mog(theta, x, proposal)
        else:
            if not hasattr(self._neural_net, "log_prob"):
                raise ValueError(
                    "The neural estimator must have a log_prob method, for\
                                 atomic loss. It should at best follow the \
                                 sbi.neural_nets 'DensityEstiamtor' interface."
                )
            return self._log_prob_proposal_posterior_atomic(theta, x, masks)

    def _log_prob_proposal_posterior_atomic(
        self, theta: Tensor, x: Tensor, masks: Tensor
    ):
        """Return log probability of the proposal posterior for atomic proposals.

        We have two main options when evaluating the proposal posterior.
            (1) Generate atoms from the proposal prior.
            (2) Generate atoms from a more targeted distribution, such as the most
                recent posterior.
        If we choose the latter, it is likely beneficial not to do this in the first
        round, since we would be sampling from a randomly-initialized neural density
        estimator.

        Args:
            theta: Batch of parameters θ.
            x: Batch of data.
            masks: Mask that is True for prior samples in the batch in order to train
                them with prior loss.

        Returns:
            Log-probability of the proposal posterior.
        """
        batch_size = theta.shape[0]

        num_atoms = int(
            clamp_and_warn("num_atoms", self._num_atoms, min_val=2, max_val=batch_size)
        )

        # Each set of parameter atoms is evaluated using the same x,
        # so we repeat rows of the data x, e.g. [1, 2] -> [1, 1, 2, 2]
        repeated_x = repeat_rows(x, num_atoms)

        # To generate the full set of atoms for a given item in the batch,
        # we sample without replacement num_atoms - 1 times from the rest
        # of the theta in the batch.
        probs = ones(batch_size, batch_size) * (1 - eye(batch_size)) / (batch_size - 1)

        choices = torch.multinomial(probs, num_samples=num_atoms - 1, replacement=False)
        contrasting_theta = theta[choices]

        # We can now create our sets of atoms from the contrasting parameter sets
        # we have generated.
        atomic_theta = torch.cat((theta[:, None, :], contrasting_theta), dim=1).reshape(
            batch_size * num_atoms, -1
        )

        # Get (batch_size * num_atoms) log prob prior evals.
        log_prob_prior = self._prior.log_prob(atomic_theta)
        log_prob_prior = log_prob_prior.reshape(batch_size, num_atoms)
        assert_all_finite(log_prob_prior, "prior eval")

        # Evaluate large batch giving (batch_size * num_atoms) log prob posterior evals.
        atomic_theta = reshape_to_sample_batch_event(
            atomic_theta, atomic_theta.shape[1:]
        )
        repeated_x = reshape_to_batch_event(
            repeated_x, self._neural_net.condition_shape
        )
        log_prob_posterior = self._neural_net.log_prob(atomic_theta, repeated_x)
        assert_all_finite(log_prob_posterior, "posterior eval")
        log_prob_posterior = log_prob_posterior.reshape(batch_size, num_atoms)

        # Compute unnormalized proposal posterior.
        unnormalized_log_prob = log_prob_posterior - log_prob_prior

        # Normalize proposal posterior across discrete set of atoms.
        log_prob_proposal_posterior = unnormalized_log_prob[:, 0] - torch.logsumexp(
            unnormalized_log_prob, dim=-1
        )
        assert_all_finite(log_prob_proposal_posterior, "proposal posterior eval")

        # XXX This evaluates the posterior on _all_ prior samples
        if self._use_combined_loss:
            theta = reshape_to_sample_batch_event(theta, self._neural_net.input_shape)
            x = reshape_to_batch_event(x, self._neural_net.condition_shape)
            log_prob_posterior_non_atomic = self._neural_net.log_prob(theta, x)
            # squeeze to remove sample dimension, which is always one during the loss
            # evaluation of `SNPE_C` (because we have one theta vector per x vector).
            log_prob_posterior_non_atomic = log_prob_posterior_non_atomic.squeeze(dim=0)
            masks = masks.reshape(-1)
            log_prob_proposal_posterior = (
                masks * log_prob_posterior_non_atomic + log_prob_proposal_posterior
            )

        return log_prob_proposal_posterior

    def _log_prob_proposal_posterior_mog(
        self, theta: Tensor, x: Tensor, proposal: DirectPosterior
    ) -> Tensor:
        """Return log-probability of the proposal posterior for MoG proposal.

        For MoG proposals and MoG density estimators, this can be done in closed form
        and does not require atomic loss (i.e. there will be no leakage issues).

        Notation:

        m are mean vectors.
        prec are precision matrices.
        cov are covariance matrices.

        _p at the end indicates that it is the proposal.
        _d indicates that it is the density estimator.
        _pp indicates the proposal posterior.

        All tensors will have shapes (batch_dim, num_components, ...)

        Args:
            theta: Batch of parameters θ.
            x: Batch of data.
            proposal: Proposal distribution.

        Returns:
            Log-probability of the proposal posterior.
        """

        # Evaluate the proposal. MDNs do not have functionality to run the embedding_net
        # and then get the mixture_components (**without** calling log_prob()). Hence,
        # we call them separately here.
        encoded_x = proposal.posterior_estimator.net._embedding_net(proposal.default_x)
        dist = (
            proposal.posterior_estimator.net._distribution
        )  # defined to avoid ugly black formatting.
        logits_p, m_p, prec_p, _, _ = dist.get_mixture_components(encoded_x)
        norm_logits_p = logits_p - torch.logsumexp(logits_p, dim=-1, keepdim=True)

        # Evaluate the density estimator.
        encoded_x = self._neural_net.net._embedding_net(x)
        dist = self._neural_net.net._distribution  # defined to avoid black formatting.
        logits_d, m_d, prec_d, _, _ = dist.get_mixture_components(encoded_x)
        norm_logits_d = logits_d - torch.logsumexp(logits_d, dim=-1, keepdim=True)

        # z-score theta if it z-scoring had been requested.
        theta = self._maybe_z_score_theta(theta)

        # Compute the MoG parameters of the proposal posterior.
        (
            logits_pp,
            m_pp,
            prec_pp,
            cov_pp,
        ) = self._automatic_posterior_transformation(
            norm_logits_p, m_p, prec_p, norm_logits_d, m_d, prec_d
        )

        # Compute the log_prob of theta under the product.
        log_prob_proposal_posterior = mog_log_prob(theta, logits_pp, m_pp, prec_pp)
        assert_all_finite(
            log_prob_proposal_posterior,
            """the evaluation of the MoG proposal posterior. This is likely due to a
            numerical instability in the training procedure. Please create an issue on
            Github.""",
        )

        return log_prob_proposal_posterior

    def _automatic_posterior_transformation(
        self,
        logits_p: Tensor,
        means_p: Tensor,
        precisions_p: Tensor,
        logits_d: Tensor,
        means_d: Tensor,
        precisions_d: Tensor,
    ):
        r"""Returns the MoG parameters of the proposal posterior.

        The proposal posterior is:
        $pp(\theta|x) = 1/Z * q(\theta|x) * prop(\theta) / p(\theta)$
        In words: proposal posterior = posterior estimate * proposal / prior.

        If the posterior estimate and the proposal are MoG and the prior is either
        Gaussian or uniform, we can solve this in closed-form. The is implemented in
        this function.

        This function implements Appendix A1 from Greenberg et al. 2019.

        We have to build L*K components. How do we do this?
        Example: proposal has two components, density estimator has three components.
        Let's call the two components of the proposal i,j and the three components
        of the density estimator x,y,z. We have to multiply every component of the
        proposal with every component of the density estimator. So, what we do is:
        1) for the proposal, build: i,i,i,j,j,j. Done with torch.repeat_interleave()
        2) for the density estimator, build: x,y,z,x,y,z. Done with torch.repeat()
        3) Multiply them with simple matrix operations.

        Args:
            logits_p: Component weight of each Gaussian of the proposal.
            means_p: Mean of each Gaussian of the proposal.
            precisions_p: Precision matrix of each Gaussian of the proposal.
            logits_d: Component weight for each Gaussian of the density estimator.
            means_d: Mean of each Gaussian of the density estimator.
            precisions_d: Precision matrix of each Gaussian of the density estimator.

        Returns: (Component weight, mean, precision matrix, covariance matrix) of each
            Gaussian of the proposal posterior. Has L*K terms (proposal has L terms,
            density estimator has K terms).
        """

        precisions_pp, covariances_pp = self._precisions_proposal_posterior(
            precisions_p, precisions_d
        )

        means_pp = self._means_proposal_posterior(
            covariances_pp, means_p, precisions_p, means_d, precisions_d
        )

        logits_pp = self._logits_proposal_posterior(
            means_pp,
            precisions_pp,
            covariances_pp,
            logits_p,
            means_p,
            precisions_p,
            logits_d,
            means_d,
            precisions_d,
        )

        return logits_pp, means_pp, precisions_pp, covariances_pp

    def _precisions_proposal_posterior(
        self, precisions_p: Tensor, precisions_d: Tensor
    ):
        """Return the precisions and covariances of the proposal posterior.

        Args:
            precisions_p: Precision matrices of the proposal distribution.
            precisions_d: Precision matrices of the density estimator.

        Returns: (Precisions, Covariances) of the proposal posterior. L*K terms.
        """

        num_comps_p = precisions_p.shape[1]
        num_comps_d = precisions_d.shape[1]

        precisions_p_rep = precisions_p.repeat_interleave(num_comps_d, dim=1)
        precisions_d_rep = precisions_d.repeat(1, num_comps_p, 1, 1)

        precisions_pp = precisions_p_rep + precisions_d_rep
        if isinstance(self._maybe_z_scored_prior, MultivariateNormal):
            precisions_pp -= self._maybe_z_scored_prior.precision_matrix

        covariances_pp = torch.inverse(precisions_pp)

        return precisions_pp, covariances_pp

    def _means_proposal_posterior(
        self,
        covariances_pp: Tensor,
        means_p: Tensor,
        precisions_p: Tensor,
        means_d: Tensor,
        precisions_d: Tensor,
    ):
        """Return the means of the proposal posterior.

        means_pp = C_ix * (P_i * m_i + P_x * m_x - P_o * m_o).

        Args:
            covariances_pp: Covariance matrices of the proposal posterior.
            means_p: Means of the proposal distribution.
            precisions_p: Precision matrices of the proposal distribution.
            means_d: Means of the density estimator.
            precisions_d: Precision matrices of the density estimator.

        Returns: Means of the proposal posterior. L*K terms.
        """

        num_comps_p = precisions_p.shape[1]
        num_comps_d = precisions_d.shape[1]

        # First, compute the product P_i * m_i and P_j * m_j
        prec_m_prod_p = batched_mixture_mv(precisions_p, means_p)
        prec_m_prod_d = batched_mixture_mv(precisions_d, means_d)

        # Repeat them to allow for matrix operations: same trick as for the precisions.
        prec_m_prod_p_rep = prec_m_prod_p.repeat_interleave(num_comps_d, dim=1)
        prec_m_prod_d_rep = prec_m_prod_d.repeat(1, num_comps_p, 1)

        # Means = C_ij * (P_i * m_i + P_x * m_x - P_o * m_o).
        summed_cov_m_prod_rep = prec_m_prod_p_rep + prec_m_prod_d_rep
        if isinstance(self._maybe_z_scored_prior, MultivariateNormal):
            summed_cov_m_prod_rep -= self.prec_m_prod_prior

        means_pp = batched_mixture_mv(covariances_pp, summed_cov_m_prod_rep)

        return means_pp

    @staticmethod
    def _logits_proposal_posterior(
        means_pp: Tensor,
        precisions_pp: Tensor,
        covariances_pp: Tensor,
        logits_p: Tensor,
        means_p: Tensor,
        precisions_p: Tensor,
        logits_d: Tensor,
        means_d: Tensor,
        precisions_d: Tensor,
    ):
        """Return the component weights (i.e. logits) of the proposal posterior.

        Args:
            means_pp: Means of the proposal posterior.
            precisions_pp: Precision matrices of the proposal posterior.
            covariances_pp: Covariance matrices of the proposal posterior.
            logits_p: Component weights (i.e. logits) of the proposal distribution.
            means_p: Means of the proposal distribution.
            precisions_p: Precision matrices of the proposal distribution.
            logits_d: Component weights (i.e. logits) of the density estimator.
            means_d: Means of the density estimator.
            precisions_d: Precision matrices of the density estimator.

        Returns: Component weights of the proposal posterior. L*K terms.
        """

        num_comps_p = precisions_p.shape[1]
        num_comps_d = precisions_d.shape[1]

        # Compute log(alpha_i * beta_j)
        logits_p_rep = logits_p.repeat_interleave(num_comps_d, dim=1)
        logits_d_rep = logits_d.repeat(1, num_comps_p)
        logit_factors = logits_p_rep + logits_d_rep

        # Compute sqrt(det()/(det()*det()))
        logdet_covariances_pp = torch.logdet(covariances_pp)
        logdet_covariances_p = -torch.logdet(precisions_p)
        logdet_covariances_d = -torch.logdet(precisions_d)

        # Repeat the proposal and density estimator terms such that there are LK terms.
        # Same trick as has been used above.
        logdet_covariances_p_rep = logdet_covariances_p.repeat_interleave(
            num_comps_d, dim=1
        )
        logdet_covariances_d_rep = logdet_covariances_d.repeat(1, num_comps_p)

        log_sqrt_det_ratio = 0.5 * (
            logdet_covariances_pp
            - (logdet_covariances_p_rep + logdet_covariances_d_rep)
        )

        # Compute for proposal, density estimator, and proposal posterior:
        # mu_i.T * P_i * mu_i
        exponent_p = batched_mixture_vmv(precisions_p, means_p)
        exponent_d = batched_mixture_vmv(precisions_d, means_d)
        exponent_pp = batched_mixture_vmv(precisions_pp, means_pp)

        # Extend proposal and density estimator exponents to get LK terms.
        exponent_p_rep = exponent_p.repeat_interleave(num_comps_d, dim=1)
        exponent_d_rep = exponent_d.repeat(1, num_comps_p)
        exponent = -0.5 * (exponent_p_rep + exponent_d_rep - exponent_pp)

        logits_pp = logit_factors + log_sqrt_det_ratio + exponent

        return logits_pp

    def _maybe_z_score_theta(self, theta: Tensor) -> Tensor:
        """Return potentially standardized theta if z-scoring was requested."""

        if self.z_score_theta:
            theta, _ = self._neural_net.net._transform(theta)

        return theta

__init__(prior=None, density_estimator='maf', device='cpu', logging_level='WARNING', summary_writer=None, show_progress_bars=True)

NPE-C / APT [1].

[1] Automatic Posterior Transformation for Likelihood-free Inference, Greenberg et al., ICML 2019, https://arxiv.org/abs/1905.07488.

Like all NPE methods, this method trains a deep neural density estimator to directly approximate the posterior. Also like all other NPE methods, in the first round, this density estimator is trained with a maximum-likelihood loss.

For the sequential mode in which the density estimator is trained across rounds, this class implements two loss variants of NPE-C: the non-atomic and the atomic version. The atomic loss of NPE-C can be used for any density estimator, i.e. also for normalizing flows. However, it suffers from leakage issues. On the other hand, the non-atomic loss can only be used only if the proposal distribution is a mixture of Gaussians, the density estimator is a mixture of Gaussians, and the prior is either Gaussian or Uniform. It does not suffer from leakage issues. At the beginning of each round, we print whether the non-atomic or the atomic version is used.

In this codebase, we will automatically switch to the non-atomic loss if the following criteria are fulfilled:
- proposal is a DirectPosterior with density_estimator mdn, as built with sbi.neural_nets.posterior_nn().
- the density estimator is a mdn, as built with sbi.neural_nets.posterior_nn().
- isinstance(prior, MultivariateNormal) (from torch.distributions) or isinstance(prior, sbi.utils.BoxUniform)

Note that custom implementations of any of these densities (or estimators) will not trigger the non-atomic loss, and the algorithm will fall back onto using the atomic loss.

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them.

None
density_estimator Union[str, Callable]

If it is a string, use a pre-configured network of the provided type (one of nsf, maf, mdn, made). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the density estimator. The density estimator needs to provide the methods .log_prob and .sample().

'maf'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'WARNING'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during training.

True
Source code in sbi/inference/trainers/npe/npe_c.py
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
def __init__(
    self,
    prior: Optional[Distribution] = None,
    density_estimator: Union[str, Callable] = "maf",
    device: str = "cpu",
    logging_level: Union[int, str] = "WARNING",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""NPE-C / APT [1].

    [1] _Automatic Posterior Transformation for Likelihood-free Inference_,
        Greenberg et al., ICML 2019, https://arxiv.org/abs/1905.07488.

    Like all NPE methods, this method trains a deep neural density estimator to
    directly approximate the posterior. Also like all other NPE methods, in the
    first round, this density estimator is trained with a maximum-likelihood loss.

    For the sequential mode in which the density estimator is trained across rounds,
    this class implements two loss variants of NPE-C: the non-atomic and the atomic
    version. The atomic loss of NPE-C can be used for any density estimator,
    i.e. also for normalizing flows. However, it suffers from leakage issues. On
    the other hand, the non-atomic loss can only be used only if the proposal
    distribution is a mixture of Gaussians, the density estimator is a mixture of
    Gaussians, and the prior is either Gaussian or Uniform. It does not suffer from
    leakage issues. At the beginning of each round, we print whether the non-atomic
    or the atomic version is used.

    In this codebase, we will automatically switch to the non-atomic loss if the
    following criteria are fulfilled:<br/>
    - proposal is a `DirectPosterior` with density_estimator `mdn`, as built
        with `sbi.neural_nets.posterior_nn()`.<br/>
    - the density estimator is a `mdn`, as built with
        `sbi.neural_nets.posterior_nn()`.<br/>
    - `isinstance(prior, MultivariateNormal)` (from `torch.distributions`) or
        `isinstance(prior, sbi.utils.BoxUniform)`

    Note that custom implementations of any of these densities (or estimators) will
    not trigger the non-atomic loss, and the algorithm will fall back onto using
    the atomic loss.

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them.
        density_estimator: If it is a string, use a pre-configured network of the
            provided type (one of nsf, maf, mdn, made). Alternatively, a function
            that builds a custom neural network can be provided. The function will
            be called with the first batch of simulations (theta, x), which can
            thus be used for shape inference and potentially for z-scoring. It
            needs to return a PyTorch `nn.Module` implementing the density
            estimator. The density estimator needs to provide the methods
            `.log_prob` and `.sample()`.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during training.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

train(num_atoms=10, training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, calibration_kernel=None, resume_training=False, force_first_round_loss=False, discard_prior_samples=False, use_combined_loss=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None)

Return density estimator that approximates the distribution \(p(\theta|x)\).

Parameters:

Name Type Description Default
num_atoms int

Number of atoms to use for classification.

10
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

20
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
calibration_kernel Optional[Callable]

A function to calibrate the loss with respect to the simulations x. See Lueckmann, Gonçalves et al., NeurIPS 2017.

None
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
force_first_round_loss bool

If True, train with maximum likelihood, i.e., potentially ignoring the correction for using a proposal distribution different from the prior.

False
discard_prior_samples bool

Whether to discard samples simulated in round 1, i.e. from the prior. Training may be sped up by ignoring such less targeted samples.

False
use_combined_loss bool

Whether to train the neural net also on prior samples using maximum likelihood in addition to training it on all samples using atomic loss. The extra MLE loss helps prevent density leaking with bounded priors.

False
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round.

False
show_train_summary bool

Whether to print the number of epochs and validation loss and leakage after the training.

False
dataloader_kwargs Optional[Dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None

Returns:

Type Description
Module

Density estimator that approximates the distribution \(p(\theta|x)\).

Source code in sbi/inference/trainers/npe/npe_c.py
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
def train(
    self,
    num_atoms: int = 10,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    resume_training: bool = False,
    force_first_round_loss: bool = False,
    discard_prior_samples: bool = False,
    use_combined_loss: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> nn.Module:
    r"""Return density estimator that approximates the distribution $p(\theta|x)$.

    Args:
        num_atoms: Number of atoms to use for classification.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect to the
            simulations `x`. See Lueckmann, Gonçalves et al., NeurIPS 2017.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        force_first_round_loss: If `True`, train with maximum likelihood,
            i.e., potentially ignoring the correction for using a proposal
            distribution different from the prior.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        use_combined_loss: Whether to train the neural net also on prior samples
            using maximum likelihood in addition to training it on all samples using
            atomic loss. The extra MLE loss helps prevent density leaking with
            bounded priors.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)

    Returns:
        Density estimator that approximates the distribution $p(\theta|x)$.
    """

    # WARNING: sneaky trick ahead. We proxy the parent's `train` here,
    # requiring the signature to have `num_atoms`, save it for use below, and
    # continue. It's sneaky because we are using the object (self) as a namespace
    # to pass arguments between functions, and that's implicit state management.
    self._num_atoms = num_atoms
    self._use_combined_loss = use_combined_loss
    kwargs = del_entries(
        locals(),
        entries=("self", "__class__", "num_atoms", "use_combined_loss"),
    )

    self._round = max(self._data_round_index)

    if self._round > 0:
        # Set the proposal to the last proposal that was passed by the user. For
        # atomic SNPE, it does not matter what the proposal is. For non-atomic
        # SNPE, we only use the latest data that was passed, i.e. the one from the
        # last proposal.
        proposal = self._proposal_roundwise[-1]
        self.use_non_atomic_loss = (
            isinstance(proposal, DirectPosterior)
            and isinstance(proposal.posterior_estimator.net._distribution, mdn)
            and isinstance(self._neural_net.net._distribution, mdn)
            and check_dist_class(
                self._prior, class_to_check=(Uniform, MultivariateNormal)
            )[0]
        )

        algorithm = "non-atomic" if self.use_non_atomic_loss else "atomic"
        print(f"Using SNPE-C with {algorithm} loss")

        if self.use_non_atomic_loss:
            # Take care of z-scoring, pre-compute and store prior terms.
            self._set_state_for_mog_proposal()

    return super().train(**kwargs)

FMPE

Bases: NeuralInference

Implements the Flow Matching Posterior Estimator (FMPE) for simulation-based inference.

Source code in sbi/inference/trainers/fmpe/fmpe.py
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
class FMPE(NeuralInference):
    """Implements the Flow Matching Posterior Estimator (FMPE) for
    simulation-based inference.
    """

    def __init__(
        self,
        prior: Optional[Distribution],
        density_estimator: Union[str, Callable] = "mlp",
        device: str = "cpu",
        logging_level: Union[int, str] = "WARNING",
        summary_writer: Optional[SummaryWriter] = None,
        show_progress_bars: bool = True,
    ) -> None:
        """Initialization method for the FMPE class.

        Args:
            prior: Prior distribution.
            density_estimator: Neural network architecture used to learn the vector
                field for flow matching. Can be a string, e.g., 'mlp' or 'resnet', or a
                `Callable` that builds a corresponding neural network.
            device: Device to use for training.
            logging_level: Logging level.
            summary_writer: Summary writer for tensorboard.
            show_progress_bars: Whether to show progress bars.
        """
        # obtain the shape of the prior samples
        if isinstance(density_estimator, str):
            self._build_neural_net = flowmatching_nn(model=density_estimator)
        else:
            self._build_neural_net = density_estimator

        super().__init__(
            prior=prior,
            device=device,
            logging_level=logging_level,
            summary_writer=summary_writer,
            show_progress_bars=show_progress_bars,
        )

    def append_simulations(
        self,
        theta: torch.Tensor,
        x: torch.Tensor,
        proposal: Optional[DirectPosterior] = None,
        exclude_invalid_x: Optional[bool] = None,
        data_device: Optional[str] = None,
    ) -> NeuralInference:
        if (
            proposal is None
            or proposal is self._prior
            or (
                isinstance(proposal, RestrictedPrior) and proposal._prior is self._prior
            )
        ):
            current_round = 0
        else:
            raise NotImplementedError(
                "Sequential FMPE with proposal different from prior is not implemented."
            )

        if exclude_invalid_x is None:
            exclude_invalid_x = current_round == 0

        if data_device is None:
            data_device = self._device

        theta, x = validate_theta_and_x(
            theta, x, data_device=data_device, training_device=self._device
        )

        is_valid_x, num_nans, num_infs = handle_invalid_x(
            x, exclude_invalid_x=exclude_invalid_x
        )

        x = x[is_valid_x]
        theta = theta[is_valid_x]

        # Check for problematic z-scoring
        warn_if_zscoring_changes_data(x)
        # Check whether there are NaNs or Infs in the data and remove accordingly.
        npe_msg_on_invalid_x(
            num_nans=num_nans,
            num_infs=num_infs,
            exclude_invalid_x=exclude_invalid_x,
            algorithm="Single-round FMPE",
        )

        self._data_round_index.append(current_round)
        prior_masks = mask_sims_from_prior(int(current_round > 0), theta.size(0))

        self._theta_roundwise.append(theta)
        self._x_roundwise.append(x)
        self._prior_masks.append(prior_masks)

        return self

    def train(
        self,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        resume_training: bool = False,
        force_first_round_loss: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[dict] = None,
    ) -> ConditionalDensityEstimator:
        """Train the flow matching estimator.

        Args:
            training_batch_size: Batch size for training. Defaults to 50.
            learning_rate: Learning rate for training. Defaults to 5e-4.
            validation_fraction: Fraction of the data to use for validation.
            stop_after_epochs: Number of epochs to train for. Defaults to 20.
            max_num_epochs: Maximum number of epochs to train for.
            clip_max_norm: Maximum norm for gradient clipping. Defaults to 5.0.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            force_first_round_loss: Whether to allow training with
                simulations that have not been sampled from the prior, e.g., in a
                sequential inference setting. Note that can lead to biased inference
                results.
            show_train_summary: Whether to show the training summary. Defaults to False.
            dataloader_kwargs: Additional keyword arguments for the dataloader.

        Returns:
            DensityEstimator: Trained flow matching estimator.
        """

        # Load data from most recent round.
        self._round = max(self._data_round_index)

        if self._round == 0 and self._neural_net is not None:
            assert force_first_round_loss or resume_training, (
                "You have already trained this neural network. After you had trained "
                "the network, you again appended simulations with `append_simulations"
                "(theta, x)`, but you did not provide a proposal. If the new "
                "simulations are sampled from the prior, you can set "
                "`.train(..., force_first_round_loss=True`). However, if the new "
                "simulations were not sampled from the prior, you should pass the "
                "proposal, i.e. `append_simulations(theta, x, proposal)`. If "
                "your samples are not sampled from the prior and you do not pass a "
                "proposal and you set `force_first_round_loss=True`, the result of "
                "FMPE will not be the true posterior. Instead, it will be the proposal "
                "posterior, which (usually) is more narrow than the true posterior."
            )

        start_idx = 0  # as there is no multi-round FMPE yet

        train_loader, val_loader = self.get_dataloaders(
            start_idx,
            training_batch_size,
            validation_fraction,
            resume_training=resume_training,
            dataloader_kwargs=dataloader_kwargs,
        )

        if self._neural_net is None:
            # Get theta, x to initialize NN
            theta, x, _ = self.get_simulations(starting_round=start_idx)

            # Use only training data for building the neural net (z-scoring transforms)
            self._neural_net = self._build_neural_net(
                theta[self.train_indices].to("cpu"),
                x[self.train_indices].to("cpu"),
            )

            del theta, x

        # Move entire net to device for training.
        self._neural_net.to(self._device)

        # initialize optimizer and training parameters
        if not resume_training:
            self.optimizer = Adam(
                list(self._neural_net.net.parameters()), lr=learning_rate
            )
            self.epoch = 0
            # NOTE: in the FMPE context we use MSE loss, not log probs.
            self._val_loss = float("Inf")

        while self.epoch <= max_num_epochs and not self._converged(
            self.epoch, stop_after_epochs
        ):
            self._neural_net.net.train()
            train_loss_sum = 0
            epoch_start_time = time.time()
            for batch in train_loader:
                self.optimizer.zero_grad()
                # get batches on current device.
                theta_batch, x_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                )

                train_loss = self._neural_net.loss(theta_batch, x_batch).mean()
                train_loss_sum += train_loss.item()

                train_loss.backward()
                if clip_max_norm is not None:
                    clip_grad_norm_(
                        self._neural_net.net.parameters(), max_norm=clip_max_norm
                    )
                self.optimizer.step()

            self.epoch += 1

            train_loss_average = train_loss_sum / len(train_loader)  # type: ignore
            self._summary["training_loss"].append(train_loss_average)

            # Calculate validation performance.
            self._neural_net.eval()
            val_loss_sum = 0

            with torch.no_grad():
                for batch in val_loader:
                    theta_batch, x_batch = (
                        batch[0].to(self._device),
                        batch[1].to(self._device),
                    )
                    # Aggregate the validation losses.
                    val_losses = self._neural_net.loss(theta_batch, x_batch)
                    val_loss_sum += val_losses.sum().item()

            # Take mean over all validation samples.
            self._val_loss = val_loss_sum / (
                len(val_loader) * val_loader.batch_size  # type: ignore
            )
            # Log validation loss for every epoch.
            self._summary["validation_loss"].append(self._val_loss)
            self._summary["epoch_durations_sec"].append(time.time() - epoch_start_time)

            self._maybe_show_progress(self._show_progress_bars, self.epoch)

        self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

        # Update summary.
        self._summary["epochs_trained"].append(self.epoch)
        self._summary["best_validation_loss"].append(self._best_val_loss)

        # Update tensorboard and summary dict.
        self._summarize(round_=self._round)

        # Update description for progress bar.
        if show_train_summary:
            print(self._describe_round(self._round, self._summary))

        # Avoid keeping the gradients in the resulting network, which can
        # cause memory leakage when benchmarking.
        self._neural_net.zero_grad(set_to_none=True)

        return deepcopy(self._neural_net)

    def build_posterior(
        self,
        density_estimator: Optional[ConditionalDensityEstimator] = None,
        prior: Optional[Distribution] = None,
        sample_with: str = "direct",
        direct_sampling_parameters: Optional[Dict[str, Any]] = None,
        **kwargs,
    ) -> DirectPosterior:
        """Build the posterior distribution.

        Args:
            density_estimator: Density estimator for the posterior.
            prior: Prior distribution.
            sample_with: Sampling method, currently only "direct" is supported.
            direct_sampling_parameters: kwargs for DirectPosterior.

        Returns:
            DirectPosterior: Posterior distribution.
        """
        if sample_with != "direct":
            raise NotImplementedError(
                "Currently, only direct sampling is supported for FMPE."
            )

        if prior is None:
            assert self._prior is not None, (
                "You did not pass a prior. You have to pass the prior either at "
                "initialization `inference = SNPE(prior)` or to "
                "`.build_posterior(prior=prior)`."
            )
            prior = self._prior
        else:
            utils.check_prior(prior)

        if density_estimator is None:
            posterior_estimator = self._neural_net
            # If internal net is used device is defined.
            device = self._device
        else:
            posterior_estimator = density_estimator
            # Otherwise, infer it from the device of the net parameters.
            device = next(density_estimator.parameters()).device.type

        self._posterior = DirectPosterior(
            posterior_estimator=posterior_estimator,  # type: ignore
            prior=prior,
            device=device,
            **direct_sampling_parameters or {},
        )

        return deepcopy(self._posterior)

__init__(prior, density_estimator='mlp', device='cpu', logging_level='WARNING', summary_writer=None, show_progress_bars=True)

Initialization method for the FMPE class.

Parameters:

Name Type Description Default
prior Optional[Distribution]

Prior distribution.

required
density_estimator Union[str, Callable]

Neural network architecture used to learn the vector field for flow matching. Can be a string, e.g., ‘mlp’ or ‘resnet’, or a Callable that builds a corresponding neural network.

'mlp'
device str

Device to use for training.

'cpu'
logging_level Union[int, str]

Logging level.

'WARNING'
summary_writer Optional[SummaryWriter]

Summary writer for tensorboard.

None
show_progress_bars bool

Whether to show progress bars.

True
Source code in sbi/inference/trainers/fmpe/fmpe.py
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
def __init__(
    self,
    prior: Optional[Distribution],
    density_estimator: Union[str, Callable] = "mlp",
    device: str = "cpu",
    logging_level: Union[int, str] = "WARNING",
    summary_writer: Optional[SummaryWriter] = None,
    show_progress_bars: bool = True,
) -> None:
    """Initialization method for the FMPE class.

    Args:
        prior: Prior distribution.
        density_estimator: Neural network architecture used to learn the vector
            field for flow matching. Can be a string, e.g., 'mlp' or 'resnet', or a
            `Callable` that builds a corresponding neural network.
        device: Device to use for training.
        logging_level: Logging level.
        summary_writer: Summary writer for tensorboard.
        show_progress_bars: Whether to show progress bars.
    """
    # obtain the shape of the prior samples
    if isinstance(density_estimator, str):
        self._build_neural_net = flowmatching_nn(model=density_estimator)
    else:
        self._build_neural_net = density_estimator

    super().__init__(
        prior=prior,
        device=device,
        logging_level=logging_level,
        summary_writer=summary_writer,
        show_progress_bars=show_progress_bars,
    )

build_posterior(density_estimator=None, prior=None, sample_with='direct', direct_sampling_parameters=None, **kwargs)

Build the posterior distribution.

Parameters:

Name Type Description Default
density_estimator Optional[ConditionalDensityEstimator]

Density estimator for the posterior.

None
prior Optional[Distribution]

Prior distribution.

None
sample_with str

Sampling method, currently only “direct” is supported.

'direct'
direct_sampling_parameters Optional[Dict[str, Any]]

kwargs for DirectPosterior.

None

Returns:

Name Type Description
DirectPosterior DirectPosterior

Posterior distribution.

Source code in sbi/inference/trainers/fmpe/fmpe.py
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
def build_posterior(
    self,
    density_estimator: Optional[ConditionalDensityEstimator] = None,
    prior: Optional[Distribution] = None,
    sample_with: str = "direct",
    direct_sampling_parameters: Optional[Dict[str, Any]] = None,
    **kwargs,
) -> DirectPosterior:
    """Build the posterior distribution.

    Args:
        density_estimator: Density estimator for the posterior.
        prior: Prior distribution.
        sample_with: Sampling method, currently only "direct" is supported.
        direct_sampling_parameters: kwargs for DirectPosterior.

    Returns:
        DirectPosterior: Posterior distribution.
    """
    if sample_with != "direct":
        raise NotImplementedError(
            "Currently, only direct sampling is supported for FMPE."
        )

    if prior is None:
        assert self._prior is not None, (
            "You did not pass a prior. You have to pass the prior either at "
            "initialization `inference = SNPE(prior)` or to "
            "`.build_posterior(prior=prior)`."
        )
        prior = self._prior
    else:
        utils.check_prior(prior)

    if density_estimator is None:
        posterior_estimator = self._neural_net
        # If internal net is used device is defined.
        device = self._device
    else:
        posterior_estimator = density_estimator
        # Otherwise, infer it from the device of the net parameters.
        device = next(density_estimator.parameters()).device.type

    self._posterior = DirectPosterior(
        posterior_estimator=posterior_estimator,  # type: ignore
        prior=prior,
        device=device,
        **direct_sampling_parameters or {},
    )

    return deepcopy(self._posterior)

train(training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, resume_training=False, force_first_round_loss=False, show_train_summary=False, dataloader_kwargs=None)

Train the flow matching estimator.

Parameters:

Name Type Description Default
training_batch_size int

Batch size for training. Defaults to 50.

200
learning_rate float

Learning rate for training. Defaults to 5e-4.

0.0005
validation_fraction float

Fraction of the data to use for validation.

0.1
stop_after_epochs int

Number of epochs to train for. Defaults to 20.

20
max_num_epochs int

Maximum number of epochs to train for.

2 ** 31 - 1
clip_max_norm Optional[float]

Maximum norm for gradient clipping. Defaults to 5.0.

5.0
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
force_first_round_loss bool

Whether to allow training with simulations that have not been sampled from the prior, e.g., in a sequential inference setting. Note that can lead to biased inference results.

False
show_train_summary bool

Whether to show the training summary. Defaults to False.

False
dataloader_kwargs Optional[dict]

Additional keyword arguments for the dataloader.

None

Returns:

Name Type Description
DensityEstimator ConditionalDensityEstimator

Trained flow matching estimator.

Source code in sbi/inference/trainers/fmpe/fmpe.py
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
def train(
    self,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    resume_training: bool = False,
    force_first_round_loss: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[dict] = None,
) -> ConditionalDensityEstimator:
    """Train the flow matching estimator.

    Args:
        training_batch_size: Batch size for training. Defaults to 50.
        learning_rate: Learning rate for training. Defaults to 5e-4.
        validation_fraction: Fraction of the data to use for validation.
        stop_after_epochs: Number of epochs to train for. Defaults to 20.
        max_num_epochs: Maximum number of epochs to train for.
        clip_max_norm: Maximum norm for gradient clipping. Defaults to 5.0.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        force_first_round_loss: Whether to allow training with
            simulations that have not been sampled from the prior, e.g., in a
            sequential inference setting. Note that can lead to biased inference
            results.
        show_train_summary: Whether to show the training summary. Defaults to False.
        dataloader_kwargs: Additional keyword arguments for the dataloader.

    Returns:
        DensityEstimator: Trained flow matching estimator.
    """

    # Load data from most recent round.
    self._round = max(self._data_round_index)

    if self._round == 0 and self._neural_net is not None:
        assert force_first_round_loss or resume_training, (
            "You have already trained this neural network. After you had trained "
            "the network, you again appended simulations with `append_simulations"
            "(theta, x)`, but you did not provide a proposal. If the new "
            "simulations are sampled from the prior, you can set "
            "`.train(..., force_first_round_loss=True`). However, if the new "
            "simulations were not sampled from the prior, you should pass the "
            "proposal, i.e. `append_simulations(theta, x, proposal)`. If "
            "your samples are not sampled from the prior and you do not pass a "
            "proposal and you set `force_first_round_loss=True`, the result of "
            "FMPE will not be the true posterior. Instead, it will be the proposal "
            "posterior, which (usually) is more narrow than the true posterior."
        )

    start_idx = 0  # as there is no multi-round FMPE yet

    train_loader, val_loader = self.get_dataloaders(
        start_idx,
        training_batch_size,
        validation_fraction,
        resume_training=resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )

    if self._neural_net is None:
        # Get theta, x to initialize NN
        theta, x, _ = self.get_simulations(starting_round=start_idx)

        # Use only training data for building the neural net (z-scoring transforms)
        self._neural_net = self._build_neural_net(
            theta[self.train_indices].to("cpu"),
            x[self.train_indices].to("cpu"),
        )

        del theta, x

    # Move entire net to device for training.
    self._neural_net.to(self._device)

    # initialize optimizer and training parameters
    if not resume_training:
        self.optimizer = Adam(
            list(self._neural_net.net.parameters()), lr=learning_rate
        )
        self.epoch = 0
        # NOTE: in the FMPE context we use MSE loss, not log probs.
        self._val_loss = float("Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):
        self._neural_net.net.train()
        train_loss_sum = 0
        epoch_start_time = time.time()
        for batch in train_loader:
            self.optimizer.zero_grad()
            # get batches on current device.
            theta_batch, x_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
            )

            train_loss = self._neural_net.loss(theta_batch, x_batch).mean()
            train_loss_sum += train_loss.item()

            train_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.net.parameters(), max_norm=clip_max_norm
                )
            self.optimizer.step()

        self.epoch += 1

        train_loss_average = train_loss_sum / len(train_loader)  # type: ignore
        self._summary["training_loss"].append(train_loss_average)

        # Calculate validation performance.
        self._neural_net.eval()
        val_loss_sum = 0

        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                )
                # Aggregate the validation losses.
                val_losses = self._neural_net.loss(theta_batch, x_batch)
                val_loss_sum += val_losses.sum().item()

        # Take mean over all validation samples.
        self._val_loss = val_loss_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )
        # Log validation loss for every epoch.
        self._summary["validation_loss"].append(self._val_loss)
        self._summary["epoch_durations_sec"].append(time.time() - epoch_start_time)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs_trained"].append(self.epoch)
    self._summary["best_validation_loss"].append(self._best_val_loss)

    # Update tensorboard and summary dict.
    self._summarize(round_=self._round)

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    # Avoid keeping the gradients in the resulting network, which can
    # cause memory leakage when benchmarking.
    self._neural_net.zero_grad(set_to_none=True)

    return deepcopy(self._neural_net)

NPSE

Bases: NeuralInference

Source code in sbi/inference/trainers/npse/npse.py
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
class NPSE(NeuralInference):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        score_estimator: Union[str, Callable] = "mlp",
        sde_type: str = "ve",
        device: str = "cpu",
        logging_level: Union[int, str] = "WARNING",
        summary_writer: Optional[SummaryWriter] = None,
        show_progress_bars: bool = True,
        **kwargs,
    ):
        """Base class for Neural Posterior Score Estimation methods.

        Instead of performing conditonal *density* estimation, NPSE methods perform
        conditional *score* estimation i.e. they estimate the gradient of the log
        density using denoising score matching loss.

        NOTE: NPSE does not support multi-round inference with flexible proposals yet.
        You can try to run multi-round with truncated proposals, but note that this is
        not tested yet.

        Args:
            prior: Prior distribution.
            score_estimator: Neural network architecture for the score estimator. Can be
                a string (e.g. 'mlp' or 'ada_mlp') or a callable that returns a neural
                network.
            sde_type: Type of SDE to use. Must be one of ['vp', 've', 'subvp'].
            device: Device to run the training on.
            logging_level: Logging level for the training. Can be an integer or a
                string.
            summary_writer: Tensorboard summary writer.
            show_progress_bars: Whether to show progress bars during training.
            kwargs: Additional keyword arguments.

        References:
            - Geffner, Tomas, George Papamakarios, and Andriy Mnih. "Score modeling for
              simulation-based inference." ICML 2023.
            - Sharrock, Louis, et al. "Sequential neural score estimation: Likelihood-
              free inference with conditional score based diffusion models." ICML 2024.
        """

        super().__init__(
            prior=prior,
            device=device,
            logging_level=logging_level,
            summary_writer=summary_writer,
            show_progress_bars=show_progress_bars,
        )

        # As detailed in the docstring, `score_estimator` is either a string or
        # a callable. The function creating the neural network is attached to
        # `_build_neural_net`. It will be called in the first round and receive
        # thetas and xs as inputs, so that they can be used for shape inference and
        # potentially for z-scoring.
        check_estimator_arg(score_estimator)
        if isinstance(score_estimator, str):
            self._build_neural_net = posterior_score_nn(
                sde_type=sde_type, score_net_type=score_estimator, **kwargs
            )
        else:
            self._build_neural_net = score_estimator

        self._proposal_roundwise = []

    def append_simulations(
        self,
        theta: Tensor,
        x: Tensor,
        proposal: Optional[DirectPosterior] = None,
        exclude_invalid_x: Optional[bool] = None,
        data_device: Optional[str] = None,
    ) -> "NPSE":
        r"""Store parameters and simulation outputs to use them for later training.

        Data are stored as entries in lists for each type of variable (parameter/data).

        Stores $\theta$, $x$, prior_masks (indicating if simulations are coming from the
        prior or not) and an index indicating which round the batch of simulations came
        from.

        Args:
            theta: Parameter sets.
            x: Simulation outputs.
            proposal: The distribution that the parameters $\theta$ were sampled from.
                Pass `None` if the parameters were sampled from the prior. If not
                `None`, it will trigger a different loss-function.
            exclude_invalid_x: Whether invalid simulations are discarded during
                training. For single-round SNPE, it is fine to discard invalid
                simulations, but for multi-round SNPE (atomic), discarding invalid
                simulations gives systematically wrong results. If `None`, it will
                be `True` in the first round and `False` in later rounds.
            data_device: Where to store the data, default is on the same device where
                the training is happening. If training a large dataset on a GPU with not
                much VRAM can set to 'cpu' to store data on system memory instead.

        Returns:
            NeuralInference object (returned so that this function is chainable).
        """
        assert (
            proposal is None
        ), "Multi-round NPSE is not yet implemented. Please use single-round NPSE."
        current_round = 0

        if exclude_invalid_x is None:
            exclude_invalid_x = current_round == 0

        if data_device is None:
            data_device = self._device

        theta, x = validate_theta_and_x(
            theta, x, data_device=data_device, training_device=self._device
        )

        is_valid_x, num_nans, num_infs = handle_invalid_x(
            x, exclude_invalid_x=exclude_invalid_x
        )

        x = x[is_valid_x]
        theta = theta[is_valid_x]

        # Check for problematic z-scoring
        warn_if_zscoring_changes_data(x)

        npe_msg_on_invalid_x(num_nans, num_infs, exclude_invalid_x, "Single-round NPE")

        self._data_round_index.append(current_round)
        prior_masks = mask_sims_from_prior(int(current_round > 0), theta.size(0))

        self._theta_roundwise.append(theta)
        self._x_roundwise.append(x)
        self._prior_masks.append(prior_masks)

        self._proposal_roundwise.append(proposal)

        if self._prior is None or isinstance(self._prior, ImproperEmpirical):
            theta_prior = self.get_simulations()[0].to(self._device)
            self._prior = ImproperEmpirical(
                theta_prior, ones(theta_prior.shape[0], device=self._device)
            )

        return self

    def train(
        self,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 200,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        calibration_kernel: Optional[Callable] = None,
        ema_loss_decay: float = 0.1,
        resume_training: bool = False,
        force_first_round_loss: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[dict] = None,
    ) -> ConditionalScoreEstimator:
        r"""Returns a score estimator that approximates the score
        $\nabla_\theta \log p(\theta|x)$.

        Args:
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            calibration_kernel: A function to calibrate the loss with respect
                to the simulations `x` (optional). See Lueckmann, Gonçalves et al.,
                NeurIPS 2017. If `None`, no calibration is used.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            force_first_round_loss: If `True`, train with maximum likelihood,
                i.e., potentially ignoring the correction for using a proposal
                distribution different from the prior.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)

        Returns:
            Score estimator that approximates the posterior score.
        """
        # Load data from most recent round.
        self._round = max(self._data_round_index)

        if self._round == 0 and self._neural_net is not None:
            assert force_first_round_loss or resume_training, (
                "You have already trained this neural network. After you had trained "
                "the network, you again appended simulations with `append_simulations"
                "(theta, x)`, but you did not provide a proposal. If the new "
                "simulations are sampled from the prior, you can set "
                "`.train(..., force_first_round_loss=True`). However, if the new "
                "simulations were not sampled from the prior, you should pass the "
                "proposal, i.e. `append_simulations(theta, x, proposal)`. If "
                "your samples are not sampled from the prior and you do not pass a "
                "proposal and you set `force_first_round_loss=True`, the result of "
                "NPSE will not be the true posterior. Instead, it will be the proposal "
                "posterior, which (usually) is more narrow than the true posterior."
            )

        # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
        if calibration_kernel is None:

            def default_calibration_kernel(x):
                return ones([len(x)], device=self._device)

            calibration_kernel = default_calibration_kernel

        # Starting index for the training set (1 = discard round-0 samples).
        start_idx = int(discard_prior_samples and self._round > 0)

        # Set the proposal to the last proposal that was passed by the user. For
        # atomic SNPE, it does not matter what the proposal is. For non-atomic
        # SNPE, we only use the latest data that was passed, i.e. the one from the
        # last proposal.
        proposal = self._proposal_roundwise[-1]

        train_loader, val_loader = self.get_dataloaders(
            start_idx,
            training_batch_size,
            validation_fraction,
            resume_training,
            dataloader_kwargs=dataloader_kwargs,
        )
        # First round or if retraining from scratch:
        # Call the `self._build_neural_net` with the rounds' thetas and xs as
        # arguments, which will build the neural network.
        if self._neural_net is None or retrain_from_scratch:
            # Get theta,x to initialize NN
            theta, x, _ = self.get_simulations(starting_round=start_idx)
            # Use only training data for building the neural net (z-scoring transforms)

            self._neural_net = self._build_neural_net(
                theta[self.train_indices].to("cpu"),
                x[self.train_indices].to("cpu"),
            )

            test_posterior_net_for_multi_d_x(
                self._neural_net,
                theta.to("cpu"),
                x.to("cpu"),
            )

            del theta, x

        # Move entire net to device for training.
        self._neural_net.to(self._device)

        if not resume_training:
            self.optimizer = Adam(list(self._neural_net.parameters()), lr=learning_rate)

            self.epoch, self._val_loss = 0, float("Inf")

        while self.epoch <= max_num_epochs and not self._converged(
            self.epoch, stop_after_epochs
        ):
            # Train for a single epoch.
            self._neural_net.train()
            train_loss_sum = 0
            epoch_start_time = time.time()
            for batch in train_loader:
                self.optimizer.zero_grad()
                # Get batches on current device.
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )

                train_losses = self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                    force_first_round_loss=force_first_round_loss,
                )

                train_loss = torch.mean(train_losses)

                train_loss_sum += train_losses.sum().item()

                train_loss.backward()
                if clip_max_norm is not None:
                    clip_grad_norm_(
                        self._neural_net.parameters(), max_norm=clip_max_norm
                    )
                self.optimizer.step()

            self.epoch += 1

            train_loss_average = train_loss_sum / (
                len(train_loader) * train_loader.batch_size  # type: ignore
            )

            # NOTE: Due to the inherently noisy nature we do instead log a exponential
            # moving average of the training loss.
            if len(self._summary["training_loss"]) == 0:
                self._summary["training_loss"].append(train_loss_average)
            else:
                previous_loss = self._summary["training_loss"][-1]
                self._summary["training_loss"].append(
                    (1.0 - ema_loss_decay) * previous_loss
                    + ema_loss_decay * train_loss_average
                )

            # Calculate validation performance.
            self._neural_net.eval()
            val_loss_sum = 0

            with torch.no_grad():
                for batch in val_loader:
                    theta_batch, x_batch, masks_batch = (
                        batch[0].to(self._device),
                        batch[1].to(self._device),
                        batch[2].to(self._device),
                    )
                    # Take negative loss here to get validation log_prob.
                    val_losses = self._loss(
                        theta_batch,
                        x_batch,
                        masks_batch,
                        proposal,
                        calibration_kernel,
                        force_first_round_loss=force_first_round_loss,
                    )
                    val_loss_sum += val_losses.sum().item()

            # Take mean over all validation samples.
            val_loss = val_loss_sum / (
                len(val_loader) * val_loader.batch_size  # type: ignore
            )

            # NOTE: Due to the inherently noisy nature we do instead log a exponential
            # moving average of the validation loss.
            if len(self._summary["validation_loss"]) == 0:
                val_loss_ema = val_loss
            else:
                previous_loss = self._summary["validation_loss"][-1]
                val_loss_ema = (
                    1 - ema_loss_decay
                ) * previous_loss + ema_loss_decay * val_loss

            self._val_loss = val_loss_ema
            self._summary["validation_loss"].append(self._val_loss)
            self._summary["epoch_durations_sec"].append(time.time() - epoch_start_time)

            self._maybe_show_progress(self._show_progress_bars, self.epoch)

        self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

        # Update summary.
        self._summary["epochs_trained"].append(self.epoch)
        self._summary["best_validation_loss"].append(self._val_loss)

        # Update tensorboard and summary dict.
        self._summarize(round_=self._round)

        # Update description for progress bar.
        if show_train_summary:
            print(self._describe_round(self._round, self._summary))

        # Avoid keeping the gradients in the resulting network, which can
        # cause memory leakage when benchmarking.
        self._neural_net.zero_grad(set_to_none=True)

        return deepcopy(self._neural_net)

    def build_posterior(
        self,
        score_estimator: Optional[ConditionalScoreEstimator] = None,
        prior: Optional[Distribution] = None,
        sample_with: str = "sde",
    ) -> ScorePosterior:
        r"""Build posterior from the score estimator.

        For NPSE, the posterior distribution that is returned here implements the
        following functionality over the raw neural density estimator:
        - correct the calculation of the log probability such that it compensates for
            the leakage.
        - reject samples that lie outside of the prior bounds.

        Args:
            score_estimator: The score estimator that the posterior is based on.
                If `None`, use the latest neural score estimator that was trained.
            prior: Prior distribution.
            sample_with: Method to use for sampling from the posterior. Can be one of
                'sde' (default) or 'ode'. The 'sde' method uses the score to
                do a Langevin diffusion step, while the 'ode' method uses the score to
                define a probabilistic ODE and solves it with a numerical ODE solver.

        Returns:
            Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
        """
        if prior is None:
            assert self._prior is not None, (
                "You did not pass a prior. You have to pass the prior either at "
                "initialization `inference = NPSE(prior)` or to "
                "`.build_posterior(prior=prior)`."
            )
            prior = self._prior
        else:
            utils.check_prior(prior)

        if score_estimator is None:
            score_estimator = self._neural_net
            # If internal net is used device is defined.
            device = self._device
        # Otherwise, infer it from the device of the net parameters.
        else:
            # TODO: Add protocol for checking if the score estimator has forward and
            # loss methods with the correct signature.
            device = str(next(score_estimator.parameters()).device)

        posterior = ScorePosterior(
            score_estimator,  # type: ignore
            prior,
            device=device,
            sample_with=sample_with,
        )

        self._posterior = posterior
        # Store models at end of each round.
        self._model_bank.append(deepcopy(self._posterior))

        return deepcopy(self._posterior)

    def _loss_proposal_posterior(
        self,
        theta: Tensor,
        x: Tensor,
        masks: Tensor,
        proposal: Optional[Any],
    ) -> Tensor:
        raise NotImplementedError("Multi-round NPSE is not yet implemented.")

    def _loss(
        self,
        theta: Tensor,
        x: Tensor,
        masks: Tensor,
        proposal: Optional[Any],
        calibration_kernel: Callable,
        force_first_round_loss: bool = False,
    ) -> Tensor:
        """Return loss from score estimator. Currently only single-round NPSE
         is implemented, i.e., no proposal correction is applied for later rounds.

        The loss is the negative log prob. Irrespective of the round or SNPE method
        (A, B, or C), it can be weighted with a calibration kernel.

        Returns:
            Calibration kernel-weighted negative log prob.
            force_first_round_loss: If `True`, train with maximum likelihood,
                i.e., potentially ignoring the correction for using a proposal
                distribution different from the prior.
        """
        if self._round == 0 or force_first_round_loss:
            # First round loss.
            loss = self._neural_net.loss(theta, x)
        else:
            raise NotImplementedError(
                "Multi-round NPSE with arbitrary proposals is not implemented"
            )

        return calibration_kernel(x) * loss

    def _converged(self, epoch: int, stop_after_epochs: int) -> bool:
        """Check if training has converged.

        Unlike the `._converged` method in base.py, this method does not reset to the
        best model. We noticed that this improves performance. Deleting this method
        will make C2ST tests fail. This is because the loss is very stochastic, so
        resetting might reset to an underfitted model. Ideally, we would write a
        custom `._converged()` method which checks whether the loss is still going
        down **for all t**.

        Args:
            epoch: Current epoch.
            stop_after_epochs: Number of epochs to wait for improvement on the
                validation set before terminating training.

        Returns:
            Whether training has converged.
        """
        converged = False

        # No checkpointing, just check if the validation loss has improved.

        # (Re)-start the epoch count with the first epoch or any improvement.
        if epoch == 0 or self._val_loss < self._best_val_loss:
            self._best_val_loss = self._val_loss
            self._epochs_since_last_improvement = 0
        else:
            self._epochs_since_last_improvement += 1

        # If no validation improvement over many epochs, stop training.
        if self._epochs_since_last_improvement > stop_after_epochs - 1:
            converged = True

        return converged

__init__(prior=None, score_estimator='mlp', sde_type='ve', device='cpu', logging_level='WARNING', summary_writer=None, show_progress_bars=True, **kwargs)

Base class for Neural Posterior Score Estimation methods.

Instead of performing conditonal density estimation, NPSE methods perform conditional score estimation i.e. they estimate the gradient of the log density using denoising score matching loss.

NOTE: NPSE does not support multi-round inference with flexible proposals yet. You can try to run multi-round with truncated proposals, but note that this is not tested yet.

Parameters:

Name Type Description Default
prior Optional[Distribution]

Prior distribution.

None
score_estimator Union[str, Callable]

Neural network architecture for the score estimator. Can be a string (e.g. ‘mlp’ or ‘ada_mlp’) or a callable that returns a neural network.

'mlp'
sde_type str

Type of SDE to use. Must be one of [‘vp’, ‘ve’, ‘subvp’].

've'
device str

Device to run the training on.

'cpu'
logging_level Union[int, str]

Logging level for the training. Can be an integer or a string.

'WARNING'
summary_writer Optional[SummaryWriter]

Tensorboard summary writer.

None
show_progress_bars bool

Whether to show progress bars during training.

True
kwargs

Additional keyword arguments.

{}
References
  • Geffner, Tomas, George Papamakarios, and Andriy Mnih. “Score modeling for simulation-based inference.” ICML 2023.
  • Sharrock, Louis, et al. “Sequential neural score estimation: Likelihood- free inference with conditional score based diffusion models.” ICML 2024.
Source code in sbi/inference/trainers/npse/npse.py
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def __init__(
    self,
    prior: Optional[Distribution] = None,
    score_estimator: Union[str, Callable] = "mlp",
    sde_type: str = "ve",
    device: str = "cpu",
    logging_level: Union[int, str] = "WARNING",
    summary_writer: Optional[SummaryWriter] = None,
    show_progress_bars: bool = True,
    **kwargs,
):
    """Base class for Neural Posterior Score Estimation methods.

    Instead of performing conditonal *density* estimation, NPSE methods perform
    conditional *score* estimation i.e. they estimate the gradient of the log
    density using denoising score matching loss.

    NOTE: NPSE does not support multi-round inference with flexible proposals yet.
    You can try to run multi-round with truncated proposals, but note that this is
    not tested yet.

    Args:
        prior: Prior distribution.
        score_estimator: Neural network architecture for the score estimator. Can be
            a string (e.g. 'mlp' or 'ada_mlp') or a callable that returns a neural
            network.
        sde_type: Type of SDE to use. Must be one of ['vp', 've', 'subvp'].
        device: Device to run the training on.
        logging_level: Logging level for the training. Can be an integer or a
            string.
        summary_writer: Tensorboard summary writer.
        show_progress_bars: Whether to show progress bars during training.
        kwargs: Additional keyword arguments.

    References:
        - Geffner, Tomas, George Papamakarios, and Andriy Mnih. "Score modeling for
          simulation-based inference." ICML 2023.
        - Sharrock, Louis, et al. "Sequential neural score estimation: Likelihood-
          free inference with conditional score based diffusion models." ICML 2024.
    """

    super().__init__(
        prior=prior,
        device=device,
        logging_level=logging_level,
        summary_writer=summary_writer,
        show_progress_bars=show_progress_bars,
    )

    # As detailed in the docstring, `score_estimator` is either a string or
    # a callable. The function creating the neural network is attached to
    # `_build_neural_net`. It will be called in the first round and receive
    # thetas and xs as inputs, so that they can be used for shape inference and
    # potentially for z-scoring.
    check_estimator_arg(score_estimator)
    if isinstance(score_estimator, str):
        self._build_neural_net = posterior_score_nn(
            sde_type=sde_type, score_net_type=score_estimator, **kwargs
        )
    else:
        self._build_neural_net = score_estimator

    self._proposal_roundwise = []

append_simulations(theta, x, proposal=None, exclude_invalid_x=None, data_device=None)

Store parameters and simulation outputs to use them for later training.

Data are stored as entries in lists for each type of variable (parameter/data).

Stores \(\theta\), \(x\), prior_masks (indicating if simulations are coming from the prior or not) and an index indicating which round the batch of simulations came from.

Parameters:

Name Type Description Default
theta Tensor

Parameter sets.

required
x Tensor

Simulation outputs.

required
proposal Optional[DirectPosterior]

The distribution that the parameters \(\theta\) were sampled from. Pass None if the parameters were sampled from the prior. If not None, it will trigger a different loss-function.

None
exclude_invalid_x Optional[bool]

Whether invalid simulations are discarded during training. For single-round SNPE, it is fine to discard invalid simulations, but for multi-round SNPE (atomic), discarding invalid simulations gives systematically wrong results. If None, it will be True in the first round and False in later rounds.

None
data_device Optional[str]

Where to store the data, default is on the same device where the training is happening. If training a large dataset on a GPU with not much VRAM can set to ‘cpu’ to store data on system memory instead.

None

Returns:

Type Description
NPSE

NeuralInference object (returned so that this function is chainable).

Source code in sbi/inference/trainers/npse/npse.py
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
def append_simulations(
    self,
    theta: Tensor,
    x: Tensor,
    proposal: Optional[DirectPosterior] = None,
    exclude_invalid_x: Optional[bool] = None,
    data_device: Optional[str] = None,
) -> "NPSE":
    r"""Store parameters and simulation outputs to use them for later training.

    Data are stored as entries in lists for each type of variable (parameter/data).

    Stores $\theta$, $x$, prior_masks (indicating if simulations are coming from the
    prior or not) and an index indicating which round the batch of simulations came
    from.

    Args:
        theta: Parameter sets.
        x: Simulation outputs.
        proposal: The distribution that the parameters $\theta$ were sampled from.
            Pass `None` if the parameters were sampled from the prior. If not
            `None`, it will trigger a different loss-function.
        exclude_invalid_x: Whether invalid simulations are discarded during
            training. For single-round SNPE, it is fine to discard invalid
            simulations, but for multi-round SNPE (atomic), discarding invalid
            simulations gives systematically wrong results. If `None`, it will
            be `True` in the first round and `False` in later rounds.
        data_device: Where to store the data, default is on the same device where
            the training is happening. If training a large dataset on a GPU with not
            much VRAM can set to 'cpu' to store data on system memory instead.

    Returns:
        NeuralInference object (returned so that this function is chainable).
    """
    assert (
        proposal is None
    ), "Multi-round NPSE is not yet implemented. Please use single-round NPSE."
    current_round = 0

    if exclude_invalid_x is None:
        exclude_invalid_x = current_round == 0

    if data_device is None:
        data_device = self._device

    theta, x = validate_theta_and_x(
        theta, x, data_device=data_device, training_device=self._device
    )

    is_valid_x, num_nans, num_infs = handle_invalid_x(
        x, exclude_invalid_x=exclude_invalid_x
    )

    x = x[is_valid_x]
    theta = theta[is_valid_x]

    # Check for problematic z-scoring
    warn_if_zscoring_changes_data(x)

    npe_msg_on_invalid_x(num_nans, num_infs, exclude_invalid_x, "Single-round NPE")

    self._data_round_index.append(current_round)
    prior_masks = mask_sims_from_prior(int(current_round > 0), theta.size(0))

    self._theta_roundwise.append(theta)
    self._x_roundwise.append(x)
    self._prior_masks.append(prior_masks)

    self._proposal_roundwise.append(proposal)

    if self._prior is None or isinstance(self._prior, ImproperEmpirical):
        theta_prior = self.get_simulations()[0].to(self._device)
        self._prior = ImproperEmpirical(
            theta_prior, ones(theta_prior.shape[0], device=self._device)
        )

    return self

build_posterior(score_estimator=None, prior=None, sample_with='sde')

Build posterior from the score estimator.

For NPSE, the posterior distribution that is returned here implements the following functionality over the raw neural density estimator: - correct the calculation of the log probability such that it compensates for the leakage. - reject samples that lie outside of the prior bounds.

Parameters:

Name Type Description Default
score_estimator Optional[ConditionalScoreEstimator]

The score estimator that the posterior is based on. If None, use the latest neural score estimator that was trained.

None
prior Optional[Distribution]

Prior distribution.

None
sample_with str

Method to use for sampling from the posterior. Can be one of ‘sde’ (default) or ‘ode’. The ‘sde’ method uses the score to do a Langevin diffusion step, while the ‘ode’ method uses the score to define a probabilistic ODE and solves it with a numerical ODE solver.

'sde'

Returns:

Type Description
ScorePosterior

Posterior \(p(\theta|x)\) with .sample() and .log_prob() methods.

Source code in sbi/inference/trainers/npse/npse.py
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
def build_posterior(
    self,
    score_estimator: Optional[ConditionalScoreEstimator] = None,
    prior: Optional[Distribution] = None,
    sample_with: str = "sde",
) -> ScorePosterior:
    r"""Build posterior from the score estimator.

    For NPSE, the posterior distribution that is returned here implements the
    following functionality over the raw neural density estimator:
    - correct the calculation of the log probability such that it compensates for
        the leakage.
    - reject samples that lie outside of the prior bounds.

    Args:
        score_estimator: The score estimator that the posterior is based on.
            If `None`, use the latest neural score estimator that was trained.
        prior: Prior distribution.
        sample_with: Method to use for sampling from the posterior. Can be one of
            'sde' (default) or 'ode'. The 'sde' method uses the score to
            do a Langevin diffusion step, while the 'ode' method uses the score to
            define a probabilistic ODE and solves it with a numerical ODE solver.

    Returns:
        Posterior $p(\theta|x)$  with `.sample()` and `.log_prob()` methods.
    """
    if prior is None:
        assert self._prior is not None, (
            "You did not pass a prior. You have to pass the prior either at "
            "initialization `inference = NPSE(prior)` or to "
            "`.build_posterior(prior=prior)`."
        )
        prior = self._prior
    else:
        utils.check_prior(prior)

    if score_estimator is None:
        score_estimator = self._neural_net
        # If internal net is used device is defined.
        device = self._device
    # Otherwise, infer it from the device of the net parameters.
    else:
        # TODO: Add protocol for checking if the score estimator has forward and
        # loss methods with the correct signature.
        device = str(next(score_estimator.parameters()).device)

    posterior = ScorePosterior(
        score_estimator,  # type: ignore
        prior,
        device=device,
        sample_with=sample_with,
    )

    self._posterior = posterior
    # Store models at end of each round.
    self._model_bank.append(deepcopy(self._posterior))

    return deepcopy(self._posterior)

train(training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=200, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, calibration_kernel=None, ema_loss_decay=0.1, resume_training=False, force_first_round_loss=False, discard_prior_samples=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None)

Returns a score estimator that approximates the score \(\nabla_\theta \log p(\theta|x)\).

Parameters:

Name Type Description Default
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

200
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
calibration_kernel Optional[Callable]

A function to calibrate the loss with respect to the simulations x (optional). See Lueckmann, Gonçalves et al., NeurIPS 2017. If None, no calibration is used.

None
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
force_first_round_loss bool

If True, train with maximum likelihood, i.e., potentially ignoring the correction for using a proposal distribution different from the prior.

False
discard_prior_samples bool

Whether to discard samples simulated in round 1, i.e. from the prior. Training may be sped up by ignoring such less targeted samples.

False
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round.

False
show_train_summary bool

Whether to print the number of epochs and validation loss after the training.

False
dataloader_kwargs Optional[dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None

Returns:

Type Description
ConditionalScoreEstimator

Score estimator that approximates the posterior score.

Source code in sbi/inference/trainers/npse/npse.py
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
def train(
    self,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 200,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    calibration_kernel: Optional[Callable] = None,
    ema_loss_decay: float = 0.1,
    resume_training: bool = False,
    force_first_round_loss: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[dict] = None,
) -> ConditionalScoreEstimator:
    r"""Returns a score estimator that approximates the score
    $\nabla_\theta \log p(\theta|x)$.

    Args:
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        calibration_kernel: A function to calibrate the loss with respect
            to the simulations `x` (optional). See Lueckmann, Gonçalves et al.,
            NeurIPS 2017. If `None`, no calibration is used.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        force_first_round_loss: If `True`, train with maximum likelihood,
            i.e., potentially ignoring the correction for using a proposal
            distribution different from the prior.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)

    Returns:
        Score estimator that approximates the posterior score.
    """
    # Load data from most recent round.
    self._round = max(self._data_round_index)

    if self._round == 0 and self._neural_net is not None:
        assert force_first_round_loss or resume_training, (
            "You have already trained this neural network. After you had trained "
            "the network, you again appended simulations with `append_simulations"
            "(theta, x)`, but you did not provide a proposal. If the new "
            "simulations are sampled from the prior, you can set "
            "`.train(..., force_first_round_loss=True`). However, if the new "
            "simulations were not sampled from the prior, you should pass the "
            "proposal, i.e. `append_simulations(theta, x, proposal)`. If "
            "your samples are not sampled from the prior and you do not pass a "
            "proposal and you set `force_first_round_loss=True`, the result of "
            "NPSE will not be the true posterior. Instead, it will be the proposal "
            "posterior, which (usually) is more narrow than the true posterior."
        )

    # Calibration kernels proposed in Lueckmann, Gonçalves et al., 2017.
    if calibration_kernel is None:

        def default_calibration_kernel(x):
            return ones([len(x)], device=self._device)

        calibration_kernel = default_calibration_kernel

    # Starting index for the training set (1 = discard round-0 samples).
    start_idx = int(discard_prior_samples and self._round > 0)

    # Set the proposal to the last proposal that was passed by the user. For
    # atomic SNPE, it does not matter what the proposal is. For non-atomic
    # SNPE, we only use the latest data that was passed, i.e. the one from the
    # last proposal.
    proposal = self._proposal_roundwise[-1]

    train_loader, val_loader = self.get_dataloaders(
        start_idx,
        training_batch_size,
        validation_fraction,
        resume_training,
        dataloader_kwargs=dataloader_kwargs,
    )
    # First round or if retraining from scratch:
    # Call the `self._build_neural_net` with the rounds' thetas and xs as
    # arguments, which will build the neural network.
    if self._neural_net is None or retrain_from_scratch:
        # Get theta,x to initialize NN
        theta, x, _ = self.get_simulations(starting_round=start_idx)
        # Use only training data for building the neural net (z-scoring transforms)

        self._neural_net = self._build_neural_net(
            theta[self.train_indices].to("cpu"),
            x[self.train_indices].to("cpu"),
        )

        test_posterior_net_for_multi_d_x(
            self._neural_net,
            theta.to("cpu"),
            x.to("cpu"),
        )

        del theta, x

    # Move entire net to device for training.
    self._neural_net.to(self._device)

    if not resume_training:
        self.optimizer = Adam(list(self._neural_net.parameters()), lr=learning_rate)

        self.epoch, self._val_loss = 0, float("Inf")

    while self.epoch <= max_num_epochs and not self._converged(
        self.epoch, stop_after_epochs
    ):
        # Train for a single epoch.
        self._neural_net.train()
        train_loss_sum = 0
        epoch_start_time = time.time()
        for batch in train_loader:
            self.optimizer.zero_grad()
            # Get batches on current device.
            theta_batch, x_batch, masks_batch = (
                batch[0].to(self._device),
                batch[1].to(self._device),
                batch[2].to(self._device),
            )

            train_losses = self._loss(
                theta_batch,
                x_batch,
                masks_batch,
                proposal,
                calibration_kernel,
                force_first_round_loss=force_first_round_loss,
            )

            train_loss = torch.mean(train_losses)

            train_loss_sum += train_losses.sum().item()

            train_loss.backward()
            if clip_max_norm is not None:
                clip_grad_norm_(
                    self._neural_net.parameters(), max_norm=clip_max_norm
                )
            self.optimizer.step()

        self.epoch += 1

        train_loss_average = train_loss_sum / (
            len(train_loader) * train_loader.batch_size  # type: ignore
        )

        # NOTE: Due to the inherently noisy nature we do instead log a exponential
        # moving average of the training loss.
        if len(self._summary["training_loss"]) == 0:
            self._summary["training_loss"].append(train_loss_average)
        else:
            previous_loss = self._summary["training_loss"][-1]
            self._summary["training_loss"].append(
                (1.0 - ema_loss_decay) * previous_loss
                + ema_loss_decay * train_loss_average
            )

        # Calculate validation performance.
        self._neural_net.eval()
        val_loss_sum = 0

        with torch.no_grad():
            for batch in val_loader:
                theta_batch, x_batch, masks_batch = (
                    batch[0].to(self._device),
                    batch[1].to(self._device),
                    batch[2].to(self._device),
                )
                # Take negative loss here to get validation log_prob.
                val_losses = self._loss(
                    theta_batch,
                    x_batch,
                    masks_batch,
                    proposal,
                    calibration_kernel,
                    force_first_round_loss=force_first_round_loss,
                )
                val_loss_sum += val_losses.sum().item()

        # Take mean over all validation samples.
        val_loss = val_loss_sum / (
            len(val_loader) * val_loader.batch_size  # type: ignore
        )

        # NOTE: Due to the inherently noisy nature we do instead log a exponential
        # moving average of the validation loss.
        if len(self._summary["validation_loss"]) == 0:
            val_loss_ema = val_loss
        else:
            previous_loss = self._summary["validation_loss"][-1]
            val_loss_ema = (
                1 - ema_loss_decay
            ) * previous_loss + ema_loss_decay * val_loss

        self._val_loss = val_loss_ema
        self._summary["validation_loss"].append(self._val_loss)
        self._summary["epoch_durations_sec"].append(time.time() - epoch_start_time)

        self._maybe_show_progress(self._show_progress_bars, self.epoch)

    self._report_convergence_at_end(self.epoch, stop_after_epochs, max_num_epochs)

    # Update summary.
    self._summary["epochs_trained"].append(self.epoch)
    self._summary["best_validation_loss"].append(self._val_loss)

    # Update tensorboard and summary dict.
    self._summarize(round_=self._round)

    # Update description for progress bar.
    if show_train_summary:
        print(self._describe_round(self._round, self._summary))

    # Avoid keeping the gradients in the resulting network, which can
    # cause memory leakage when benchmarking.
    self._neural_net.zero_grad(set_to_none=True)

    return deepcopy(self._neural_net)

NLE_A

Bases: LikelihoodEstimator

Source code in sbi/inference/trainers/nle/nle_a.py
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
class NLE_A(LikelihoodEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        density_estimator: Union[str, Callable] = "maf",
        device: str = "cpu",
        logging_level: Union[int, str] = "WARNING",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""Neural Likelihood Estimation [1].

        [1] Sequential Neural Likelihood: Fast Likelihood-free Inference with
        Autoregressive Flows_, Papamakarios et al., AISTATS 2019,
        https://arxiv.org/abs/1805.07226

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. If `None`, the
                prior must be passed to `.build_posterior()`.
            density_estimator: If it is a string, use a pre-configured network of the
                provided type (one of nsf, maf, mdn, made). Alternatively, a function
                that builds a custom neural network can be provided. The function will
                be called with the first batch of simulations (theta, x), which can
                thus be used for shape inference and potentially for z-scoring. It
                needs to return a PyTorch `nn.Module` implementing the density
                estimator. The density estimator needs to provide the methods
                `.log_prob` and `.sample()`.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

__init__(prior=None, density_estimator='maf', device='cpu', logging_level='WARNING', summary_writer=None, show_progress_bars=True)

Neural Likelihood Estimation [1].

[1] Sequential Neural Likelihood: Fast Likelihood-free Inference with Autoregressive Flows_, Papamakarios et al., AISTATS 2019, https://arxiv.org/abs/1805.07226

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. If None, the prior must be passed to .build_posterior().

None
density_estimator Union[str, Callable]

If it is a string, use a pre-configured network of the provided type (one of nsf, maf, mdn, made). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the density estimator. The density estimator needs to provide the methods .log_prob and .sample().

'maf'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'WARNING'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
Source code in sbi/inference/trainers/nle/nle_a.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(
    self,
    prior: Optional[Distribution] = None,
    density_estimator: Union[str, Callable] = "maf",
    device: str = "cpu",
    logging_level: Union[int, str] = "WARNING",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""Neural Likelihood Estimation [1].

    [1] Sequential Neural Likelihood: Fast Likelihood-free Inference with
    Autoregressive Flows_, Papamakarios et al., AISTATS 2019,
    https://arxiv.org/abs/1805.07226

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. If `None`, the
            prior must be passed to `.build_posterior()`.
        density_estimator: If it is a string, use a pre-configured network of the
            provided type (one of nsf, maf, mdn, made). Alternatively, a function
            that builds a custom neural network can be provided. The function will
            be called with the first batch of simulations (theta, x), which can
            thus be used for shape inference and potentially for z-scoring. It
            needs to return a PyTorch `nn.Module` implementing the density
            estimator. The density estimator needs to provide the methods
            `.log_prob` and `.sample()`.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

NRE_A

Bases: RatioEstimator

Source code in sbi/inference/trainers/nre/nre_a.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
class NRE_A(RatioEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        classifier: Union[str, Callable] = "resnet",
        device: str = "cpu",
        logging_level: Union[int, str] = "warning",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""AALR[1], here known as NRE_A.

        [1] _Likelihood-free MCMC with Amortized Approximate Likelihood Ratios_, Hermans
            et al., ICML 2020, https://arxiv.org/abs/1903.04057

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. If `None`, the
                prior must be passed to `.build_posterior()`.
            classifier: Classifier trained to approximate likelihood ratios. If it is
                a string, use a pre-configured network of the provided type (one of
                linear, mlp, resnet). Alternatively, a function that builds a custom
                neural network can be provided. The function will be called with the
                first batch of simulations (theta, x), which can thus be used for shape
                inference and potentially for z-scoring. It needs to return a PyTorch
                `nn.Module` implementing the classifier.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

    def train(
        self,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        resume_training: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
        loss_kwargs: Optional[Dict[str, Any]] = None,
    ) -> nn.Module:
        r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

        Args:
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)
            loss_kwargs: Additional or updated kwargs to be passed to the self._loss fn.

        Returns:
            Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
        """

        # AALR is defined for `num_atoms=2`.
        # Proxy to `super().__call__` to ensure right parameter.
        kwargs = del_entries(locals(), entries=("self", "__class__"))
        return super().train(**kwargs, num_atoms=2)

    def _loss(self, theta: Tensor, x: Tensor, num_atoms: int) -> Tensor:
        """Returns the binary cross-entropy loss for the trained classifier.

        The classifier takes as input a $(\theta,x)$ pair. It is trained to predict 1
        if the pair was sampled from the joint $p(\theta,x)$, and to predict 0 if the
        pair was sampled from the marginals $p(\theta)p(x)$.
        """

        assert theta.shape[0] == x.shape[0], "Batch sizes for theta and x must match."
        batch_size = theta.shape[0]

        logits = self._classifier_logits(theta, x, num_atoms)
        likelihood = torch.sigmoid(logits).squeeze()

        # Alternating pairs where there is one sampled from the joint and one
        # sampled from the marginals. The first element is sampled from the
        # joint p(theta, x) and is labelled 1. The second element is sampled
        # from the marginals p(theta)p(x) and is labelled 0. And so on.
        labels = ones(2 * batch_size, device=self._device)  # two atoms
        labels[1::2] = 0.0

        # Binary cross entropy to learn the likelihood (AALR-specific)
        return nn.BCELoss()(likelihood, labels)

__init__(prior=None, classifier='resnet', device='cpu', logging_level='warning', summary_writer=None, show_progress_bars=True)

AALR[1], here known as NRE_A.

[1] Likelihood-free MCMC with Amortized Approximate Likelihood Ratios, Hermans et al., ICML 2020, https://arxiv.org/abs/1903.04057

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. If None, the prior must be passed to .build_posterior().

None
classifier Union[str, Callable]

Classifier trained to approximate likelihood ratios. If it is a string, use a pre-configured network of the provided type (one of linear, mlp, resnet). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the classifier.

'resnet'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'warning'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
Source code in sbi/inference/trainers/nre/nre_a.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(
    self,
    prior: Optional[Distribution] = None,
    classifier: Union[str, Callable] = "resnet",
    device: str = "cpu",
    logging_level: Union[int, str] = "warning",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""AALR[1], here known as NRE_A.

    [1] _Likelihood-free MCMC with Amortized Approximate Likelihood Ratios_, Hermans
        et al., ICML 2020, https://arxiv.org/abs/1903.04057

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. If `None`, the
            prior must be passed to `.build_posterior()`.
        classifier: Classifier trained to approximate likelihood ratios. If it is
            a string, use a pre-configured network of the provided type (one of
            linear, mlp, resnet). Alternatively, a function that builds a custom
            neural network can be provided. The function will be called with the
            first batch of simulations (theta, x), which can thus be used for shape
            inference and potentially for z-scoring. It needs to return a PyTorch
            `nn.Module` implementing the classifier.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

train(training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, resume_training=False, discard_prior_samples=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None, loss_kwargs=None)

Return classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Parameters:

Name Type Description Default
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

20
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
discard_prior_samples bool

Whether to discard samples simulated in round 1, i.e. from the prior. Training may be sped up by ignoring such less targeted samples.

False
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round.

False
show_train_summary bool

Whether to print the number of epochs and validation loss and leakage after the training.

False
dataloader_kwargs Optional[Dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None
loss_kwargs Optional[Dict[str, Any]]

Additional or updated kwargs to be passed to the self._loss fn.

None

Returns:

Type Description
Module

Classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Source code in sbi/inference/trainers/nre/nre_a.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
def train(
    self,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
    loss_kwargs: Optional[Dict[str, Any]] = None,
) -> nn.Module:
    r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

    Args:
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)
        loss_kwargs: Additional or updated kwargs to be passed to the self._loss fn.

    Returns:
        Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    """

    # AALR is defined for `num_atoms=2`.
    # Proxy to `super().__call__` to ensure right parameter.
    kwargs = del_entries(locals(), entries=("self", "__class__"))
    return super().train(**kwargs, num_atoms=2)

NRE_B

Bases: RatioEstimator

Source code in sbi/inference/trainers/nre/nre_b.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
class NRE_B(RatioEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        classifier: Union[str, Callable] = "resnet",
        device: str = "cpu",
        logging_level: Union[int, str] = "warning",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""SRE[1], here known as NRE_B.

        [1] _On Contrastive Learning for Likelihood-free Inference_, Durkan et al.,
            ICML 2020, https://arxiv.org/pdf/2002.03712

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. If `None`, the
                prior must be passed to `.build_posterior()`.
            classifier: Classifier trained to approximate likelihood ratios. If it is
                a string, use a pre-configured network of the provided type (one of
                linear, mlp, resnet). Alternatively, a function that builds a custom
                neural network can be provided. The function will be called with the
                first batch of simulations (theta, x), which can thus be used for shape
                inference and potentially for z-scoring. It needs to return a PyTorch
                `nn.Module` implementing the classifier.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

    def train(
        self,
        num_atoms: int = 10,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        resume_training: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
    ) -> nn.Module:
        r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

        Args:
            num_atoms: Number of atoms to use for classification.
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)

        Returns:
            Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
        """
        kwargs = del_entries(locals(), entries=("self", "__class__"))
        return super().train(**kwargs)

    def _loss(self, theta: Tensor, x: Tensor, num_atoms: int) -> Tensor:
        r"""Return cross-entropy (via softmax activation) loss for 1-out-of-`num_atoms`
        classification.

        The classifier takes as input `num_atoms` $(\theta,x)$ pairs. Out of these
        pairs, one pair was sampled from the joint $p(\theta,x)$ and all others from the
        marginals $p(\theta)p(x)$. The classifier is trained to predict which of the
        pairs was sampled from the joint $p(\theta,x)$.
        """

        assert theta.shape[0] == x.shape[0], "Batch sizes for theta and x must match."
        batch_size = theta.shape[0]
        logits = self._classifier_logits(theta, x, num_atoms)

        # For 1-out-of-`num_atoms` classification each datapoint consists
        # of `num_atoms` points, with one of them being the correct one.
        # We have a batch of `batch_size` such datapoints.
        logits = logits.reshape(batch_size, num_atoms)

        # Index 0 is the theta-x-pair sampled from the joint p(theta,x) and hence the
        # "correct" one for the 1-out-of-N classification.
        log_prob = logits[:, 0] - torch.logsumexp(logits, dim=-1)

        return -torch.mean(log_prob)

__init__(prior=None, classifier='resnet', device='cpu', logging_level='warning', summary_writer=None, show_progress_bars=True)

SRE[1], here known as NRE_B.

[1] On Contrastive Learning for Likelihood-free Inference, Durkan et al., ICML 2020, https://arxiv.org/pdf/2002.03712

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. If None, the prior must be passed to .build_posterior().

None
classifier Union[str, Callable]

Classifier trained to approximate likelihood ratios. If it is a string, use a pre-configured network of the provided type (one of linear, mlp, resnet). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the classifier.

'resnet'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'warning'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
Source code in sbi/inference/trainers/nre/nre_b.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
def __init__(
    self,
    prior: Optional[Distribution] = None,
    classifier: Union[str, Callable] = "resnet",
    device: str = "cpu",
    logging_level: Union[int, str] = "warning",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""SRE[1], here known as NRE_B.

    [1] _On Contrastive Learning for Likelihood-free Inference_, Durkan et al.,
        ICML 2020, https://arxiv.org/pdf/2002.03712

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. If `None`, the
            prior must be passed to `.build_posterior()`.
        classifier: Classifier trained to approximate likelihood ratios. If it is
            a string, use a pre-configured network of the provided type (one of
            linear, mlp, resnet). Alternatively, a function that builds a custom
            neural network can be provided. The function will be called with the
            first batch of simulations (theta, x), which can thus be used for shape
            inference and potentially for z-scoring. It needs to return a PyTorch
            `nn.Module` implementing the classifier.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

train(num_atoms=10, training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, resume_training=False, discard_prior_samples=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None)

Return classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Parameters:

Name Type Description Default
num_atoms int

Number of atoms to use for classification.

10
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

20
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
discard_prior_samples bool

Whether to discard samples simulated in round 1, i.e. from the prior. Training may be sped up by ignoring such less targeted samples.

False
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round.

False
show_train_summary bool

Whether to print the number of epochs and validation loss and leakage after the training.

False
dataloader_kwargs Optional[Dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None

Returns:

Type Description
Module

Classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Source code in sbi/inference/trainers/nre/nre_b.py
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
def train(
    self,
    num_atoms: int = 10,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> nn.Module:
    r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

    Args:
        num_atoms: Number of atoms to use for classification.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)

    Returns:
        Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    """
    kwargs = del_entries(locals(), entries=("self", "__class__"))
    return super().train(**kwargs)

NRE_C

Bases: RatioEstimator

Source code in sbi/inference/trainers/nre/nre_c.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
class NRE_C(RatioEstimator):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        classifier: Union[str, Callable] = "resnet",
        device: str = "cpu",
        logging_level: Union[int, str] = "warning",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""NRE-C[1] is a generalization of the non-sequential (amortized) versions of
        NRE_A and NRE_B. We call the algorithm NRE_C within `sbi`.

        NRE-C:
        (1) like NRE_B, features a "multiclass" loss function where several marginally
            drawn parameter-data pairs are contrasted against a jointly drawn pair.
        (2) like AALR/NRE_A, i.e., the non-sequential version of NRE_A, it encourages
            the approximate ratio $p(\theta,x)/p(\theta)p(x)$, accessed through
            `.potential()` within `sbi`, to be exact at optimum. This addresses the
            issue that NRE_B estimates this ratio only up to an arbitrary function
            (normalizing constant) of the data $x$.

        Just like for all ratio estimation algorithms, the sequential version of NRE_C
        will be estimated only up to a function (normalizing constant) of the data $x$
        in rounds after the first.

        [1] _Contrastive Neural Ratio Estimation_, Benajmin Kurt Miller, et. al.,
            NeurIPS 2022, https://arxiv.org/abs/2210.06170

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. If `None`, the
                prior must be passed to `.build_posterior()`.
            classifier: Classifier trained to approximate likelihood ratios. If it is
                a string, use a pre-configured network of the provided type (one of
                linear, mlp, resnet). Alternatively, a function that builds a custom
                neural network can be provided. The function will be called with the
                first batch of simulations (theta, x), which can thus be used for shape
                inference and potentially for z-scoring. It needs to return a PyTorch
                `nn.Module` implementing the classifier.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

    def train(
        self,
        num_classes: int = 5,
        gamma: float = 1.0,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        resume_training: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
    ) -> nn.Module:
        r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

        Args:
            num_classes: Number of theta to classify against, corresponds to $K$ in
                _Contrastive Neural Ratio Estimation_. Minimum value is 1. Similar to
                `num_atoms` for SNRE_B except SNRE_C has an additional independently
                drawn sample. The total number of alternative parameters `NRE-C` "sees"
                is $2K-1$ or `2 * num_classes - 1` divided between two loss terms.
            gamma: Determines the relative weight of the sum of all $K$ dependently
                drawn classes against the marginally drawn one. Specifically,
                $p(y=k) :=p_K$, $p(y=0) := p_0$, $p_0 = 1 - K p_K$, and finally
                $\gamma := K p_K / p_0$.
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
                during training. Expect errors, silent or explicit, when `False`.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)

        Returns:
            Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
        """
        kwargs = del_entries(locals(), entries=("self", "__class__"))
        kwargs["num_atoms"] = kwargs.pop("num_classes") + 1
        kwargs["loss_kwargs"] = {"gamma": kwargs.pop("gamma")}
        return super().train(**kwargs)

    def _loss(
        self, theta: Tensor, x: Tensor, num_atoms: int, gamma: float
    ) -> torch.Tensor:
        r"""Return cross-entropy loss (via ''multi-class sigmoid'' activation) for
        1-out-of-`K + 1` classification.

        At optimum, this loss function returns the exact likelihood-to-evidence ratio
        in the first round.
        Details of loss computation are described in Contrastive Neural Ratio
        Estimation[1]. The paper does not discuss the sequential case.

        [1] _Contrastive Neural Ratio Estimation_, Benajmin Kurt Miller, et. al.,
            NeurIPS 2022, https://arxiv.org/abs/2210.06170
        """

        # Reminder: K = num_classes
        # The algorithm is written with K, so we convert back to K format rather than
        # reasoning in num_atoms.
        num_classes = num_atoms - 1
        assert num_classes >= 1, f"num_classes = {num_classes} must be greater than 1."

        assert theta.shape[0] == x.shape[0], "Batch sizes for theta and x must match."
        batch_size = theta.shape[0]

        # We append a contrastive theta to the marginal case because we will remove
        # the jointly drawn
        # sample in the logits_marginal[:, 0] position. That makes the remaining sample
        # marginally drawn.
        # We have a batch of `batch_size` datapoints.
        logits_marginal = self._classifier_logits(theta, x, num_classes + 1).reshape(
            batch_size, num_classes + 1
        )
        logits_joint = self._classifier_logits(theta, x, num_classes).reshape(
            batch_size, num_classes
        )

        dtype = logits_marginal.dtype
        device = logits_marginal.device

        # Index 0 is the theta-x-pair sampled from the joint p(theta,x) and hence
        # we remove the jointly drawn sample from the logits_marginal
        logits_marginal = logits_marginal[:, 1:]
        # ... and retain it in the logits_joint. Now we have two arrays with K choices.

        # To use logsumexp, we extend the denominator logits with loggamma
        loggamma = torch.tensor(gamma, dtype=dtype, device=device).log()
        logK = torch.tensor(num_classes, dtype=dtype, device=device).log()
        denominator_marginal = torch.concat(
            [loggamma + logits_marginal, logK.expand((batch_size, 1))],
            dim=-1,
        )
        denominator_joint = torch.concat(
            [loggamma + logits_joint, logK.expand((batch_size, 1))],
            dim=-1,
        )

        # Compute the contributions to the loss from each term in the classification.
        log_prob_marginal = logK - torch.logsumexp(denominator_marginal, dim=-1)
        log_prob_joint = (
            loggamma + logits_joint[:, 0] - torch.logsumexp(denominator_joint, dim=-1)
        )

        # relative weights. p_marginal := p_0, and p_joint := p_K * K from the notation.
        p_marginal, p_joint = self._get_prior_probs_marginal_and_joint(gamma)
        return -torch.mean(p_marginal * log_prob_marginal + p_joint * log_prob_joint)

    @staticmethod
    def _get_prior_probs_marginal_and_joint(gamma: float) -> Tuple[float, float]:
        r"""Return a tuple (p_marginal, p_joint) where `p_marginal := `$p_0$,
        `p_joint := `$p_K \cdot K$.

        We let the joint (dependently drawn) class to be equally likely across K
        options. The marginal class is therefore restricted to get the remaining
        probability.
        """
        p_joint = gamma / (1 + gamma)
        p_marginal = 1 / (1 + gamma)
        return p_marginal, p_joint

__init__(prior=None, classifier='resnet', device='cpu', logging_level='warning', summary_writer=None, show_progress_bars=True)

NRE-C[1] is a generalization of the non-sequential (amortized) versions of NRE_A and NRE_B. We call the algorithm NRE_C within sbi.

NRE-C: (1) like NRE_B, features a “multiclass” loss function where several marginally drawn parameter-data pairs are contrasted against a jointly drawn pair. (2) like AALR/NRE_A, i.e., the non-sequential version of NRE_A, it encourages the approximate ratio \(p(\theta,x)/p(\theta)p(x)\), accessed through .potential() within sbi, to be exact at optimum. This addresses the issue that NRE_B estimates this ratio only up to an arbitrary function (normalizing constant) of the data \(x\).

Just like for all ratio estimation algorithms, the sequential version of NRE_C will be estimated only up to a function (normalizing constant) of the data \(x\) in rounds after the first.

[1] Contrastive Neural Ratio Estimation, Benajmin Kurt Miller, et. al., NeurIPS 2022, https://arxiv.org/abs/2210.06170

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. If None, the prior must be passed to .build_posterior().

None
classifier Union[str, Callable]

Classifier trained to approximate likelihood ratios. If it is a string, use a pre-configured network of the provided type (one of linear, mlp, resnet). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations (theta, x), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the classifier.

'resnet'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'warning'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
Source code in sbi/inference/trainers/nre/nre_c.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
def __init__(
    self,
    prior: Optional[Distribution] = None,
    classifier: Union[str, Callable] = "resnet",
    device: str = "cpu",
    logging_level: Union[int, str] = "warning",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""NRE-C[1] is a generalization of the non-sequential (amortized) versions of
    NRE_A and NRE_B. We call the algorithm NRE_C within `sbi`.

    NRE-C:
    (1) like NRE_B, features a "multiclass" loss function where several marginally
        drawn parameter-data pairs are contrasted against a jointly drawn pair.
    (2) like AALR/NRE_A, i.e., the non-sequential version of NRE_A, it encourages
        the approximate ratio $p(\theta,x)/p(\theta)p(x)$, accessed through
        `.potential()` within `sbi`, to be exact at optimum. This addresses the
        issue that NRE_B estimates this ratio only up to an arbitrary function
        (normalizing constant) of the data $x$.

    Just like for all ratio estimation algorithms, the sequential version of NRE_C
    will be estimated only up to a function (normalizing constant) of the data $x$
    in rounds after the first.

    [1] _Contrastive Neural Ratio Estimation_, Benajmin Kurt Miller, et. al.,
        NeurIPS 2022, https://arxiv.org/abs/2210.06170

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. If `None`, the
            prior must be passed to `.build_posterior()`.
        classifier: Classifier trained to approximate likelihood ratios. If it is
            a string, use a pre-configured network of the provided type (one of
            linear, mlp, resnet). Alternatively, a function that builds a custom
            neural network can be provided. The function will be called with the
            first batch of simulations (theta, x), which can thus be used for shape
            inference and potentially for z-scoring. It needs to return a PyTorch
            `nn.Module` implementing the classifier.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

train(num_classes=5, gamma=1.0, training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, resume_training=False, discard_prior_samples=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None)

Return classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Parameters:

Name Type Description Default
num_classes int

Number of theta to classify against, corresponds to \(K\) in Contrastive Neural Ratio Estimation. Minimum value is 1. Similar to num_atoms for SNRE_B except SNRE_C has an additional independently drawn sample. The total number of alternative parameters NRE-C “sees” is \(2K-1\) or 2 * num_classes - 1 divided between two loss terms.

5
gamma float

Determines the relative weight of the sum of all \(K\) dependently drawn classes against the marginally drawn one. Specifically, \(p(y=k) :=p_K\), \(p(y=0) := p_0\), \(p_0 = 1 - K p_K\), and finally \(\gamma := K p_K / p_0\).

1.0
training_batch_size int

Training batch size.

200
learning_rate float

Learning rate for Adam optimizer.

0.0005
validation_fraction float

The fraction of data to use for validation.

0.1
stop_after_epochs int

The number of epochs to wait for improvement on the validation set before terminating training.

20
max_num_epochs int

Maximum number of epochs to run. If reached, we stop training even when the validation loss is still decreasing. Otherwise, we train until validation loss increases (see also stop_after_epochs).

2 ** 31 - 1
clip_max_norm Optional[float]

Value at which to clip the total gradient norm in order to prevent exploding gradients. Use None for no clipping.

5.0
exclude_invalid_x

Whether to exclude simulation outputs x=NaN or x=±∞ during training. Expect errors, silent or explicit, when False.

required
resume_training bool

Can be used in case training time is limited, e.g. on a cluster. If True, the split between train and validation set, the optimizer, the number of epochs, and the best validation log-prob will be restored from the last time .train() was called.

False
discard_prior_samples bool

Whether to discard samples simulated in round 1, i.e. from the prior. Training may be sped up by ignoring such less targeted samples.

False
retrain_from_scratch bool

Whether to retrain the conditional density estimator for the posterior from scratch each round.

False
show_train_summary bool

Whether to print the number of epochs and validation loss and leakage after the training.

False
dataloader_kwargs Optional[Dict]

Additional or updated kwargs to be passed to the training and validation dataloaders (like, e.g., a collate_fn)

None

Returns:

Type Description
Module

Classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Source code in sbi/inference/trainers/nre/nre_c.py
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
def train(
    self,
    num_classes: int = 5,
    gamma: float = 1.0,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> nn.Module:
    r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.

    Args:
        num_classes: Number of theta to classify against, corresponds to $K$ in
            _Contrastive Neural Ratio Estimation_. Minimum value is 1. Similar to
            `num_atoms` for SNRE_B except SNRE_C has an additional independently
            drawn sample. The total number of alternative parameters `NRE-C` "sees"
            is $2K-1$ or `2 * num_classes - 1` divided between two loss terms.
        gamma: Determines the relative weight of the sum of all $K$ dependently
            drawn classes against the marginally drawn one. Specifically,
            $p(y=k) :=p_K$, $p(y=0) := p_0$, $p_0 = 1 - K p_K$, and finally
            $\gamma := K p_K / p_0$.
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
            during training. Expect errors, silent or explicit, when `False`.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)

    Returns:
        Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    """
    kwargs = del_entries(locals(), entries=("self", "__class__"))
    kwargs["num_atoms"] = kwargs.pop("num_classes") + 1
    kwargs["loss_kwargs"] = {"gamma": kwargs.pop("gamma")}
    return super().train(**kwargs)

BNRE

Bases: NRE_A

Source code in sbi/inference/trainers/nre/bnre.py
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
class BNRE(NRE_A):
    def __init__(
        self,
        prior: Optional[Distribution] = None,
        classifier: Union[str, Callable] = "resnet",
        device: str = "cpu",
        logging_level: Union[int, str] = "warning",
        summary_writer: Optional[TensorboardSummaryWriter] = None,
        show_progress_bars: bool = True,
    ):
        r"""Balanced neural ratio estimation (BNRE)[1].

        BNRE is a variation of NRE aiming to produce more conservative posterior
        approximations.

        [1] Delaunoy, A., Hermans, J., Rozet, F., Wehenkel, A., & Louppe, G..
        Towards Reliable Simulation-Based Inference with Balanced Neural Ratio
        Estimation.
        NeurIPS 2022. https://arxiv.org/abs/2208.13624

        Args:
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. If `None`, the
                prior must be passed to `.build_posterior()`.
            classifier: Classifier trained to approximate likelihood ratios. If it is
                a string, use a pre-configured network of the provided type (one of
                linear, mlp, resnet). Alternatively, a function that builds a custom
                neural network can be provided. The function will be called with the
                first batch of simulations $(\theta, x)$, which can thus be used for
                shape inference and potentially for z-scoring. It needs to return a
                PyTorch `nn.Module` implementing the classifier.
            device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
            logging_level: Minimum severity of messages to log. One of the strings
                INFO, WARNING, DEBUG, ERROR and CRITICAL.
            summary_writer: A tensorboard `SummaryWriter` to control, among others, log
                file location (default is `<current working directory>/logs`.)
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
        """

        kwargs = del_entries(locals(), entries=("self", "__class__"))
        super().__init__(**kwargs)

    def train(
        self,
        regularization_strength: float = 100.0,
        training_batch_size: int = 200,
        learning_rate: float = 5e-4,
        validation_fraction: float = 0.1,
        stop_after_epochs: int = 20,
        max_num_epochs: int = 2**31 - 1,
        clip_max_norm: Optional[float] = 5.0,
        resume_training: bool = False,
        discard_prior_samples: bool = False,
        retrain_from_scratch: bool = False,
        show_train_summary: bool = False,
        dataloader_kwargs: Optional[Dict] = None,
    ) -> nn.Module:
        r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
        Args:

            regularization_strength: The multiplicative coefficient applied to the
                balancing regularizer ($\lambda$).
            training_batch_size: Training batch size.
            learning_rate: Learning rate for Adam optimizer.
            validation_fraction: The fraction of data to use for validation.
            stop_after_epochs: The number of epochs to wait for improvement on the
                validation set before terminating training.
            max_num_epochs: Maximum number of epochs to run. If reached, we stop
                training even when the validation loss is still decreasing. Otherwise,
                we train until validation loss increases (see also `stop_after_epochs`).
            clip_max_norm: Value at which to clip the total gradient norm in order to
                prevent exploding gradients. Use None for no clipping.
            exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
                during training. Expect errors, silent or explicit, when `False`.
            resume_training: Can be used in case training time is limited, e.g. on a
                cluster. If `True`, the split between train and validation set, the
                optimizer, the number of epochs, and the best validation log-prob will
                be restored from the last time `.train()` was called.
            discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
                from the prior. Training may be sped up by ignoring such less targeted
                samples.
            retrain_from_scratch: Whether to retrain the conditional density
                estimator for the posterior from scratch each round.
            show_train_summary: Whether to print the number of epochs and validation
                loss and leakage after the training.
            dataloader_kwargs: Additional or updated kwargs to be passed to the training
                and validation dataloaders (like, e.g., a collate_fn)
        Returns:
            Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
        """
        kwargs = del_entries(locals(), entries=("self", "__class__"))
        kwargs["loss_kwargs"] = {
            "regularization_strength": kwargs.pop("regularization_strength")
        }
        return super().train(**kwargs)

    def _loss(
        self, theta: Tensor, x: Tensor, num_atoms: int, regularization_strength: float
    ) -> Tensor:
        """Returns the binary cross-entropy loss for the trained classifier.

        The classifier takes as input a $(\theta,x)$ pair. It is trained to predict 1
        if the pair was sampled from the joint $p(\theta,x)$, and to predict 0 if the
        pair was sampled from the marginals $p(\theta)p(x)$.
        """

        assert theta.shape[0] == x.shape[0], "Batch sizes for theta and x must match."
        batch_size = theta.shape[0]

        logits = self._classifier_logits(theta, x, num_atoms)
        likelihood = torch.sigmoid(logits).squeeze()

        # Alternating pairs where there is one sampled from the joint and one
        # sampled from the marginals. The first element is sampled from the
        # joint p(theta, x) and is labelled 1. The second element is sampled
        # from the marginals p(theta)p(x) and is labelled 0. And so on.
        labels = ones(2 * batch_size, device=self._device)  # two atoms
        labels[1::2] = 0.0

        # Binary cross entropy to learn the likelihood (AALR-specific)
        bce = nn.BCELoss()(likelihood, labels)

        # Balancing regularizer
        regularizer = (
            (torch.sigmoid(logits[0::2]) + torch.sigmoid(logits[1::2]) - 1)
            .mean()
            .square()
        )

        return bce + regularization_strength * regularizer

__init__(prior=None, classifier='resnet', device='cpu', logging_level='warning', summary_writer=None, show_progress_bars=True)

Balanced neural ratio estimation (BNRE)[1].

BNRE is a variation of NRE aiming to produce more conservative posterior approximations.

[1] Delaunoy, A., Hermans, J., Rozet, F., Wehenkel, A., & Louppe, G.. Towards Reliable Simulation-Based Inference with Balanced Neural Ratio Estimation. NeurIPS 2022. https://arxiv.org/abs/2208.13624

Parameters:

Name Type Description Default
prior Optional[Distribution]

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. If None, the prior must be passed to .build_posterior().

None
classifier Union[str, Callable]

Classifier trained to approximate likelihood ratios. If it is a string, use a pre-configured network of the provided type (one of linear, mlp, resnet). Alternatively, a function that builds a custom neural network can be provided. The function will be called with the first batch of simulations \((\theta, x)\), which can thus be used for shape inference and potentially for z-scoring. It needs to return a PyTorch nn.Module implementing the classifier.

'resnet'
device str

Training device, e.g., “cpu”, “cuda” or “cuda:{0, 1, …}”.

'cpu'
logging_level Union[int, str]

Minimum severity of messages to log. One of the strings INFO, WARNING, DEBUG, ERROR and CRITICAL.

'warning'
summary_writer Optional[TensorboardSummaryWriter]

A tensorboard SummaryWriter to control, among others, log file location (default is <current working directory>/logs.)

None
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
Source code in sbi/inference/trainers/nre/bnre.py
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def __init__(
    self,
    prior: Optional[Distribution] = None,
    classifier: Union[str, Callable] = "resnet",
    device: str = "cpu",
    logging_level: Union[int, str] = "warning",
    summary_writer: Optional[TensorboardSummaryWriter] = None,
    show_progress_bars: bool = True,
):
    r"""Balanced neural ratio estimation (BNRE)[1].

    BNRE is a variation of NRE aiming to produce more conservative posterior
    approximations.

    [1] Delaunoy, A., Hermans, J., Rozet, F., Wehenkel, A., & Louppe, G..
    Towards Reliable Simulation-Based Inference with Balanced Neural Ratio
    Estimation.
    NeurIPS 2022. https://arxiv.org/abs/2208.13624

    Args:
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. If `None`, the
            prior must be passed to `.build_posterior()`.
        classifier: Classifier trained to approximate likelihood ratios. If it is
            a string, use a pre-configured network of the provided type (one of
            linear, mlp, resnet). Alternatively, a function that builds a custom
            neural network can be provided. The function will be called with the
            first batch of simulations $(\theta, x)$, which can thus be used for
            shape inference and potentially for z-scoring. It needs to return a
            PyTorch `nn.Module` implementing the classifier.
        device: Training device, e.g., "cpu", "cuda" or "cuda:{0, 1, ...}".
        logging_level: Minimum severity of messages to log. One of the strings
            INFO, WARNING, DEBUG, ERROR and CRITICAL.
        summary_writer: A tensorboard `SummaryWriter` to control, among others, log
            file location (default is `<current working directory>/logs`.)
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
    """

    kwargs = del_entries(locals(), entries=("self", "__class__"))
    super().__init__(**kwargs)

train(regularization_strength=100.0, training_batch_size=200, learning_rate=0.0005, validation_fraction=0.1, stop_after_epochs=20, max_num_epochs=2 ** 31 - 1, clip_max_norm=5.0, resume_training=False, discard_prior_samples=False, retrain_from_scratch=False, show_train_summary=False, dataloader_kwargs=None)

Return classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\). Args:

regularization_strength: The multiplicative coefficient applied to the
    balancing regularizer ($\lambda$).
training_batch_size: Training batch size.
learning_rate: Learning rate for Adam optimizer.
validation_fraction: The fraction of data to use for validation.
stop_after_epochs: The number of epochs to wait for improvement on the
    validation set before terminating training.
max_num_epochs: Maximum number of epochs to run. If reached, we stop
    training even when the validation loss is still decreasing. Otherwise,
    we train until validation loss increases (see also `stop_after_epochs`).
clip_max_norm: Value at which to clip the total gradient norm in order to
    prevent exploding gradients. Use None for no clipping.
exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
    during training. Expect errors, silent or explicit, when `False`.
resume_training: Can be used in case training time is limited, e.g. on a
    cluster. If `True`, the split between train and validation set, the
    optimizer, the number of epochs, and the best validation log-prob will
    be restored from the last time `.train()` was called.
discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
    from the prior. Training may be sped up by ignoring such less targeted
    samples.
retrain_from_scratch: Whether to retrain the conditional density
    estimator for the posterior from scratch each round.
show_train_summary: Whether to print the number of epochs and validation
    loss and leakage after the training.
dataloader_kwargs: Additional or updated kwargs to be passed to the training
    and validation dataloaders (like, e.g., a collate_fn)

Returns: Classifier that approximates the ratio \(p(\theta,x)/p(\theta)p(x)\).

Source code in sbi/inference/trainers/nre/bnre.py
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
def train(
    self,
    regularization_strength: float = 100.0,
    training_batch_size: int = 200,
    learning_rate: float = 5e-4,
    validation_fraction: float = 0.1,
    stop_after_epochs: int = 20,
    max_num_epochs: int = 2**31 - 1,
    clip_max_norm: Optional[float] = 5.0,
    resume_training: bool = False,
    discard_prior_samples: bool = False,
    retrain_from_scratch: bool = False,
    show_train_summary: bool = False,
    dataloader_kwargs: Optional[Dict] = None,
) -> nn.Module:
    r"""Return classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    Args:

        regularization_strength: The multiplicative coefficient applied to the
            balancing regularizer ($\lambda$).
        training_batch_size: Training batch size.
        learning_rate: Learning rate for Adam optimizer.
        validation_fraction: The fraction of data to use for validation.
        stop_after_epochs: The number of epochs to wait for improvement on the
            validation set before terminating training.
        max_num_epochs: Maximum number of epochs to run. If reached, we stop
            training even when the validation loss is still decreasing. Otherwise,
            we train until validation loss increases (see also `stop_after_epochs`).
        clip_max_norm: Value at which to clip the total gradient norm in order to
            prevent exploding gradients. Use None for no clipping.
        exclude_invalid_x: Whether to exclude simulation outputs `x=NaN` or `x=±∞`
            during training. Expect errors, silent or explicit, when `False`.
        resume_training: Can be used in case training time is limited, e.g. on a
            cluster. If `True`, the split between train and validation set, the
            optimizer, the number of epochs, and the best validation log-prob will
            be restored from the last time `.train()` was called.
        discard_prior_samples: Whether to discard samples simulated in round 1, i.e.
            from the prior. Training may be sped up by ignoring such less targeted
            samples.
        retrain_from_scratch: Whether to retrain the conditional density
            estimator for the posterior from scratch each round.
        show_train_summary: Whether to print the number of epochs and validation
            loss and leakage after the training.
        dataloader_kwargs: Additional or updated kwargs to be passed to the training
            and validation dataloaders (like, e.g., a collate_fn)
    Returns:
        Classifier that approximates the ratio $p(\theta,x)/p(\theta)p(x)$.
    """
    kwargs = del_entries(locals(), entries=("self", "__class__"))
    kwargs["loss_kwargs"] = {
        "regularization_strength": kwargs.pop("regularization_strength")
    }
    return super().train(**kwargs)

MCABC

Bases: ABCBASE

Monte-Carlo Approximate Bayesian Computation (Rejection ABC).

Source code in sbi/inference/abc/mcabc.py
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
class MCABC(ABCBASE):
    """Monte-Carlo Approximate Bayesian Computation (Rejection ABC)."""

    def __init__(
        self,
        simulator: Callable,
        prior,
        distance: Union[str, Callable] = "l2",
        requires_iid_data: Optional[None] = None,
        distance_kwargs: Optional[Dict] = None,
        num_workers: int = 1,
        simulation_batch_size: int = 1,
        distance_batch_size: int = -1,
        show_progress_bars: bool = True,
    ):
        r"""Monte-Carlo Approximate Bayesian Computation (Rejection ABC) [1].

        [1] Pritchard, J. K., Seielstad, M. T., Perez-Lezaun, A., & Feldman, M. W.
        (1999). Population growth of human Y chromosomes: a study of Y chromosome
        microsatellites. Molecular biology and evolution, 16(12), 1791-1798.

        Args:
            simulator: A function that takes parameters $\theta$ and maps them to
                simulations, or observations, `x`, $\mathrm{sim}(\theta)\to x$. Any
                regular Python callable (i.e. function or class with `__call__` method)
                can be used.
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. Any
                object with `.log_prob()`and `.sample()` (for example, a PyTorch
                distribution) can be used.
            distance: Distance function to compare observed and simulated data. Can be
                a custom callable function or one of `l1`, `l2`, `mse`,
                `mmd`, `wasserstein`.
            requires_iid_data: Whether to allow conditioning on iid sampled data or not.
                Typically, this information is inferred by the choice of the distance,
                but in case a custom distance is used, this information is pivotal.
            distance_kwargs: Configurations parameters for the distances. In particular
                useful for the MMD and Wasserstein distance.
            num_workers: Number of parallel workers to use for simulations.
            simulation_batch_size: Number of parameter sets that the simulator
                maps to data x at once. If None, we simulate all parameter sets at the
                same time. If >= 1, the simulator has to process data of shape
                (simulation_batch_size, parameter_dimension).
            distance_batch_size: Number of simulations that the distance function
                evaluates against the reference observations at once. If -1, we evaluate
                all simulations at the same time.
        """

        super().__init__(
            simulator=simulator,
            prior=prior,
            distance=distance,
            requires_iid_data=requires_iid_data,
            distance_kwargs=distance_kwargs,
            num_workers=num_workers,
            simulation_batch_size=simulation_batch_size,
            distance_batch_size=distance_batch_size,
            show_progress_bars=show_progress_bars,
        )

    def __call__(
        self,
        x_o: Union[Tensor, ndarray],
        num_simulations: int,
        eps: Optional[float] = None,
        quantile: Optional[float] = None,
        lra: bool = False,
        sass: bool = False,
        sass_fraction: float = 0.25,
        sass_expansion_degree: int = 1,
        kde: bool = False,
        kde_kwargs: Optional[Dict[str, Any]] = None,
        return_summary: bool = False,
        num_iid_samples: int = 1,
    ) -> Union[Tuple[Tensor, dict], Tuple[KDEWrapper, dict], Tensor, KDEWrapper]:
        r"""Run MCABC and return accepted parameters or KDE object fitted on them.

        Args:
            x_o: Observed data.
            num_simulations: Number of simulations to run.
            eps: Acceptance threshold $\epsilon$ for distance between observed and
                simulated data.
            quantile: Upper quantile of smallest distances for which the corresponding
                parameters are returned, e.g, q=0.01 will return the top 1%. Exactly
                one of quantile or `eps` have to be passed.
            lra: Whether to run linear regression adjustment as in Beaumont et al. 2002
            sass: Whether to determine semi-automatic summary statistics as in
                Fearnhead & Prangle 2012.
            sass_fraction: Fraction of simulation budget used for the initial sass run.
            sass_expansion_degree: Degree of the polynomial feature expansion for the
                sass regression, default 1 - no expansion.
            kde: Whether to run KDE on the accepted parameters to return a KDE
                object from which one can sample.
            kde_kwargs: kwargs for performing KDE:
                'bandwidth='; either a float, or a string naming a bandwidth
                heuristics, e.g., 'cv' (cross validation), 'silvermann' or 'scott',
                default 'cv'.
                'transform': transform applied to the parameters before doing KDE.
                'sample_weights': weights associated with samples. See 'get_kde' for
                more details
            return_summary: Whether to return the distances and data corresponding to
                the accepted parameters.
            num_iid_samples: Number of simulations per parameter. Choose
                `num_iid_samples>1`, if you have chosen a statistical distance that
                evaluates sets of simulations against a set of reference observations
                instead of a single data-point comparison.

        Returns:
            theta (if kde False): accepted parameters
            kde (if kde True): KDE object based on accepted parameters from which one
                can .sample() and .log_prob().
            summary (if summary True): dictionary containing the accepted paramters (if
                kde True), distances and simulated data x.
        """

        # Exactly one of eps or quantile need to be passed.
        assert (eps is not None) ^ (
            quantile is not None
        ), "Eps or quantile must be passed, but not both."
        if kde_kwargs is None:
            kde_kwargs = {}

        # Run SASS and change the simulator and x_o accordingly.
        if sass:
            num_pilot_simulations = int(sass_fraction * num_simulations)
            self.logger.info(
                "Running SASS with %s pilot samples.", num_pilot_simulations
            )
            num_simulations -= num_pilot_simulations

            pilot_theta = self.prior.sample((num_pilot_simulations,))
            pilot_x = self._batched_simulator(pilot_theta)

            sass_transform = self.get_sass_transform(
                pilot_theta, pilot_x, sass_expansion_degree
            )

            # Add sass transform to simulator and x_o.
            def simulator(theta):
                return sass_transform(self._batched_simulator(theta))

            x_o = sass_transform(x_o)
        else:
            simulator = self._batched_simulator

        # Simulate and calculate distances.
        theta = self.prior.sample((num_simulations,))
        theta_repeat = theta.repeat_interleave(num_iid_samples, dim=0)
        x = simulator(theta_repeat)
        x = x.reshape((
            num_simulations,
            num_iid_samples,
            -1,
        ))  # Dim(num_initial_pop, num_iid_samples, -1)

        # Infer x shape to test and set x_o.
        if not self.distance.requires_iid_data:
            x = x.squeeze(1)
            self.x_shape = x[0].shape
            self.x_o = process_x(x_o, self.x_shape)
        else:
            self.x_shape = x[0, 0].shape
            self.x_o = process_x(x_o, self.x_shape)

        distances = self.distance(self.x_o, x)

        # Select based on acceptance threshold epsilon.
        if eps is not None:
            is_accepted = distances < eps
            num_accepted = is_accepted.sum().item()
            assert num_accepted > 0, f"No parameters accepted, eps={eps} too small"

            theta_accepted = theta[is_accepted]
            distances_accepted = distances[is_accepted]
            x_accepted = x[is_accepted]

        # Select based on quantile on sorted distances.
        elif quantile is not None:
            num_top_samples = int(num_simulations * quantile)
            sort_idx = torch.argsort(distances)
            theta_accepted = theta[sort_idx][:num_top_samples]
            distances_accepted = distances[sort_idx][:num_top_samples]
            x_accepted = x[sort_idx][:num_top_samples]

        else:
            raise ValueError("One of epsilon or quantile has to be passed.")

        # Maybe adjust theta with LRA.
        if lra:
            self.logger.info("Running Linear regression adjustment.")
            final_theta = self.run_lra(theta_accepted, x_accepted, observation=self.x_o)
        else:
            final_theta = theta_accepted

        if kde:
            self.logger.info(
                """KDE on %s samples with bandwidth option
                {kde_kwargs["bandwidth"] if "bandwidth" in kde_kwargs else "cv"}.
                Beware that KDE can give unreliable results when used with too few
                samples and in high dimensions.""",
                final_theta.shape[0],
            )

            kde_dist = get_kde(final_theta, **kde_kwargs)

            if return_summary:
                return (
                    kde_dist,
                    dict(theta=final_theta, distances=distances_accepted, x=x_accepted),
                )
            else:
                return kde_dist
        elif return_summary:
            return final_theta, dict(distances=distances_accepted, x=x_accepted)
        else:
            return final_theta

__call__(x_o, num_simulations, eps=None, quantile=None, lra=False, sass=False, sass_fraction=0.25, sass_expansion_degree=1, kde=False, kde_kwargs=None, return_summary=False, num_iid_samples=1)

Run MCABC and return accepted parameters or KDE object fitted on them.

Parameters:

Name Type Description Default
x_o Union[Tensor, ndarray]

Observed data.

required
num_simulations int

Number of simulations to run.

required
eps Optional[float]

Acceptance threshold \(\epsilon\) for distance between observed and simulated data.

None
quantile Optional[float]

Upper quantile of smallest distances for which the corresponding parameters are returned, e.g, q=0.01 will return the top 1%. Exactly one of quantile or eps have to be passed.

None
lra bool

Whether to run linear regression adjustment as in Beaumont et al. 2002

False
sass bool

Whether to determine semi-automatic summary statistics as in Fearnhead & Prangle 2012.

False
sass_fraction float

Fraction of simulation budget used for the initial sass run.

0.25
sass_expansion_degree int

Degree of the polynomial feature expansion for the sass regression, default 1 - no expansion.

1
kde bool

Whether to run KDE on the accepted parameters to return a KDE object from which one can sample.

False
kde_kwargs Optional[Dict[str, Any]]

kwargs for performing KDE: ‘bandwidth=’; either a float, or a string naming a bandwidth heuristics, e.g., ‘cv’ (cross validation), ‘silvermann’ or ‘scott’, default ‘cv’. ‘transform’: transform applied to the parameters before doing KDE. ‘sample_weights’: weights associated with samples. See ‘get_kde’ for more details

None
return_summary bool

Whether to return the distances and data corresponding to the accepted parameters.

False
num_iid_samples int

Number of simulations per parameter. Choose num_iid_samples>1, if you have chosen a statistical distance that evaluates sets of simulations against a set of reference observations instead of a single data-point comparison.

1

Returns:

Name Type Description
theta if kde False

accepted parameters

kde if kde True

KDE object based on accepted parameters from which one can .sample() and .log_prob().

summary if summary True

dictionary containing the accepted paramters (if kde True), distances and simulated data x.

Source code in sbi/inference/abc/mcabc.py
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
def __call__(
    self,
    x_o: Union[Tensor, ndarray],
    num_simulations: int,
    eps: Optional[float] = None,
    quantile: Optional[float] = None,
    lra: bool = False,
    sass: bool = False,
    sass_fraction: float = 0.25,
    sass_expansion_degree: int = 1,
    kde: bool = False,
    kde_kwargs: Optional[Dict[str, Any]] = None,
    return_summary: bool = False,
    num_iid_samples: int = 1,
) -> Union[Tuple[Tensor, dict], Tuple[KDEWrapper, dict], Tensor, KDEWrapper]:
    r"""Run MCABC and return accepted parameters or KDE object fitted on them.

    Args:
        x_o: Observed data.
        num_simulations: Number of simulations to run.
        eps: Acceptance threshold $\epsilon$ for distance between observed and
            simulated data.
        quantile: Upper quantile of smallest distances for which the corresponding
            parameters are returned, e.g, q=0.01 will return the top 1%. Exactly
            one of quantile or `eps` have to be passed.
        lra: Whether to run linear regression adjustment as in Beaumont et al. 2002
        sass: Whether to determine semi-automatic summary statistics as in
            Fearnhead & Prangle 2012.
        sass_fraction: Fraction of simulation budget used for the initial sass run.
        sass_expansion_degree: Degree of the polynomial feature expansion for the
            sass regression, default 1 - no expansion.
        kde: Whether to run KDE on the accepted parameters to return a KDE
            object from which one can sample.
        kde_kwargs: kwargs for performing KDE:
            'bandwidth='; either a float, or a string naming a bandwidth
            heuristics, e.g., 'cv' (cross validation), 'silvermann' or 'scott',
            default 'cv'.
            'transform': transform applied to the parameters before doing KDE.
            'sample_weights': weights associated with samples. See 'get_kde' for
            more details
        return_summary: Whether to return the distances and data corresponding to
            the accepted parameters.
        num_iid_samples: Number of simulations per parameter. Choose
            `num_iid_samples>1`, if you have chosen a statistical distance that
            evaluates sets of simulations against a set of reference observations
            instead of a single data-point comparison.

    Returns:
        theta (if kde False): accepted parameters
        kde (if kde True): KDE object based on accepted parameters from which one
            can .sample() and .log_prob().
        summary (if summary True): dictionary containing the accepted paramters (if
            kde True), distances and simulated data x.
    """

    # Exactly one of eps or quantile need to be passed.
    assert (eps is not None) ^ (
        quantile is not None
    ), "Eps or quantile must be passed, but not both."
    if kde_kwargs is None:
        kde_kwargs = {}

    # Run SASS and change the simulator and x_o accordingly.
    if sass:
        num_pilot_simulations = int(sass_fraction * num_simulations)
        self.logger.info(
            "Running SASS with %s pilot samples.", num_pilot_simulations
        )
        num_simulations -= num_pilot_simulations

        pilot_theta = self.prior.sample((num_pilot_simulations,))
        pilot_x = self._batched_simulator(pilot_theta)

        sass_transform = self.get_sass_transform(
            pilot_theta, pilot_x, sass_expansion_degree
        )

        # Add sass transform to simulator and x_o.
        def simulator(theta):
            return sass_transform(self._batched_simulator(theta))

        x_o = sass_transform(x_o)
    else:
        simulator = self._batched_simulator

    # Simulate and calculate distances.
    theta = self.prior.sample((num_simulations,))
    theta_repeat = theta.repeat_interleave(num_iid_samples, dim=0)
    x = simulator(theta_repeat)
    x = x.reshape((
        num_simulations,
        num_iid_samples,
        -1,
    ))  # Dim(num_initial_pop, num_iid_samples, -1)

    # Infer x shape to test and set x_o.
    if not self.distance.requires_iid_data:
        x = x.squeeze(1)
        self.x_shape = x[0].shape
        self.x_o = process_x(x_o, self.x_shape)
    else:
        self.x_shape = x[0, 0].shape
        self.x_o = process_x(x_o, self.x_shape)

    distances = self.distance(self.x_o, x)

    # Select based on acceptance threshold epsilon.
    if eps is not None:
        is_accepted = distances < eps
        num_accepted = is_accepted.sum().item()
        assert num_accepted > 0, f"No parameters accepted, eps={eps} too small"

        theta_accepted = theta[is_accepted]
        distances_accepted = distances[is_accepted]
        x_accepted = x[is_accepted]

    # Select based on quantile on sorted distances.
    elif quantile is not None:
        num_top_samples = int(num_simulations * quantile)
        sort_idx = torch.argsort(distances)
        theta_accepted = theta[sort_idx][:num_top_samples]
        distances_accepted = distances[sort_idx][:num_top_samples]
        x_accepted = x[sort_idx][:num_top_samples]

    else:
        raise ValueError("One of epsilon or quantile has to be passed.")

    # Maybe adjust theta with LRA.
    if lra:
        self.logger.info("Running Linear regression adjustment.")
        final_theta = self.run_lra(theta_accepted, x_accepted, observation=self.x_o)
    else:
        final_theta = theta_accepted

    if kde:
        self.logger.info(
            """KDE on %s samples with bandwidth option
            {kde_kwargs["bandwidth"] if "bandwidth" in kde_kwargs else "cv"}.
            Beware that KDE can give unreliable results when used with too few
            samples and in high dimensions.""",
            final_theta.shape[0],
        )

        kde_dist = get_kde(final_theta, **kde_kwargs)

        if return_summary:
            return (
                kde_dist,
                dict(theta=final_theta, distances=distances_accepted, x=x_accepted),
            )
        else:
            return kde_dist
    elif return_summary:
        return final_theta, dict(distances=distances_accepted, x=x_accepted)
    else:
        return final_theta

__init__(simulator, prior, distance='l2', requires_iid_data=None, distance_kwargs=None, num_workers=1, simulation_batch_size=1, distance_batch_size=-1, show_progress_bars=True)

Monte-Carlo Approximate Bayesian Computation (Rejection ABC) [1].

[1] Pritchard, J. K., Seielstad, M. T., Perez-Lezaun, A., & Feldman, M. W. (1999). Population growth of human Y chromosomes: a study of Y chromosome microsatellites. Molecular biology and evolution, 16(12), 1791-1798.

Parameters:

Name Type Description Default
simulator Callable

A function that takes parameters \(\theta\) and maps them to simulations, or observations, x, \(\mathrm{sim}(\theta)\to x\). Any regular Python callable (i.e. function or class with __call__ method) can be used.

required
prior

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. Any object with .log_prob()and .sample() (for example, a PyTorch distribution) can be used.

required
distance Union[str, Callable]

Distance function to compare observed and simulated data. Can be a custom callable function or one of l1, l2, mse, mmd, wasserstein.

'l2'
requires_iid_data Optional[None]

Whether to allow conditioning on iid sampled data or not. Typically, this information is inferred by the choice of the distance, but in case a custom distance is used, this information is pivotal.

None
distance_kwargs Optional[Dict]

Configurations parameters for the distances. In particular useful for the MMD and Wasserstein distance.

None
num_workers int

Number of parallel workers to use for simulations.

1
simulation_batch_size int

Number of parameter sets that the simulator maps to data x at once. If None, we simulate all parameter sets at the same time. If >= 1, the simulator has to process data of shape (simulation_batch_size, parameter_dimension).

1
distance_batch_size int

Number of simulations that the distance function evaluates against the reference observations at once. If -1, we evaluate all simulations at the same time.

-1
Source code in sbi/inference/abc/mcabc.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
def __init__(
    self,
    simulator: Callable,
    prior,
    distance: Union[str, Callable] = "l2",
    requires_iid_data: Optional[None] = None,
    distance_kwargs: Optional[Dict] = None,
    num_workers: int = 1,
    simulation_batch_size: int = 1,
    distance_batch_size: int = -1,
    show_progress_bars: bool = True,
):
    r"""Monte-Carlo Approximate Bayesian Computation (Rejection ABC) [1].

    [1] Pritchard, J. K., Seielstad, M. T., Perez-Lezaun, A., & Feldman, M. W.
    (1999). Population growth of human Y chromosomes: a study of Y chromosome
    microsatellites. Molecular biology and evolution, 16(12), 1791-1798.

    Args:
        simulator: A function that takes parameters $\theta$ and maps them to
            simulations, or observations, `x`, $\mathrm{sim}(\theta)\to x$. Any
            regular Python callable (i.e. function or class with `__call__` method)
            can be used.
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. Any
            object with `.log_prob()`and `.sample()` (for example, a PyTorch
            distribution) can be used.
        distance: Distance function to compare observed and simulated data. Can be
            a custom callable function or one of `l1`, `l2`, `mse`,
            `mmd`, `wasserstein`.
        requires_iid_data: Whether to allow conditioning on iid sampled data or not.
            Typically, this information is inferred by the choice of the distance,
            but in case a custom distance is used, this information is pivotal.
        distance_kwargs: Configurations parameters for the distances. In particular
            useful for the MMD and Wasserstein distance.
        num_workers: Number of parallel workers to use for simulations.
        simulation_batch_size: Number of parameter sets that the simulator
            maps to data x at once. If None, we simulate all parameter sets at the
            same time. If >= 1, the simulator has to process data of shape
            (simulation_batch_size, parameter_dimension).
        distance_batch_size: Number of simulations that the distance function
            evaluates against the reference observations at once. If -1, we evaluate
            all simulations at the same time.
    """

    super().__init__(
        simulator=simulator,
        prior=prior,
        distance=distance,
        requires_iid_data=requires_iid_data,
        distance_kwargs=distance_kwargs,
        num_workers=num_workers,
        simulation_batch_size=simulation_batch_size,
        distance_batch_size=distance_batch_size,
        show_progress_bars=show_progress_bars,
    )

SMCABC

Bases: ABCBASE

Sequential Monte Carlo Approximate Bayesian Computation.

Source code in sbi/inference/abc/smcabc.py
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
class SMCABC(ABCBASE):
    """Sequential Monte Carlo Approximate Bayesian Computation."""

    def __init__(
        self,
        simulator: Callable,
        prior: Distribution,
        distance: Union[str, Callable] = "l2",
        requires_iid_data: Optional[None] = None,
        distance_kwargs: Optional[Dict] = None,
        num_workers: int = 1,
        simulation_batch_size: int = 1,
        distance_batch_size: int = -1,
        show_progress_bars: bool = True,
        kernel: Optional[str] = "gaussian",
        algorithm_variant: str = "C",
    ):
        r"""Sequential Monte Carlo Approximate Bayesian Computation.

        We distinguish between three different SMC methods here:
            - A: Toni et al. 2010 (Phd Thesis)
            - B: Sisson et al. 2007 (with correction from 2009)
            - C: Beaumont et al. 2009

        In Toni et al. 2010 we find an overview of the differences on page 34:
            - B: same as A except for resampling of weights if the effective sampling
                size is too small.
            - C: same as A except for calculation of the covariance of the perturbation
                kernel: the kernel covariance is a scaled version of the covariance of
                the previous population.

        Args:
            simulator: A function that takes parameters $\theta$ and maps them to
                simulations, or observations, `x`, $\mathrm{sim}(\theta)\to x$. Any
                regular Python callable (i.e. function or class with `__call__` method)
                can be used.
            prior: A probability distribution that expresses prior knowledge about the
                parameters, e.g. which ranges are meaningful for them. Any
                object with `.log_prob()`and `.sample()` (for example, a PyTorch
                distribution) can be used.
            distance: Distance function to compare observed and simulated data. Can be
                a custom callable function or one of `l1`, `l2`, `mse`,
                `mmd`, `wasserstein`.
            requires_iid_data: Whether to allow conditioning on iid sampled data or not.
                Typically, this information is inferred by the choice of the distance,
                but in case a custom distance is used, this information is pivotal.
            distance_kwargs: Configurations parameters for the distances. In particular
                useful for the MMD and Wasserstein distance.
            num_workers: Number of parallel workers to use for simulations.
            simulation_batch_size: Number of parameter sets that the simulator
                maps to data x at once. If None, we simulate all parameter sets at the
                same time. If >= 1, the simulator has to process data of shape
                (simulation_batch_size, parameter_dimension).
            distance_batch_size: Number of simulations that the distance function
                evaluates against the reference observations at once. If -1, we evaluate
                all simulations at the same time.
            show_progress_bars: Whether to show a progressbar during simulation and
                sampling.
            kernel: Perturbation kernel.
            algorithm_variant: Indicating the choice of algorithm variant, A, B, or C.
        """

        super().__init__(
            simulator=simulator,
            prior=prior,
            distance=distance,
            requires_iid_data=requires_iid_data,
            distance_kwargs=distance_kwargs,
            num_workers=num_workers,
            simulation_batch_size=simulation_batch_size,
            distance_batch_size=distance_batch_size,
            show_progress_bars=show_progress_bars,
        )

        kernels = ("gaussian", "uniform")
        assert (
            kernel in kernels
        ), f"Kernel '{kernel}' not supported. Choose one from {kernels}."
        self.kernel = kernel

        algorithm_variants = ("A", "B", "C")
        assert algorithm_variant in algorithm_variants, (
            f"SMCABC variant '{algorithm_variant}' not supported, choose one from"
            " {algorithm_variants}."
        )
        self.algorithm_variant = algorithm_variant
        self.distance_to_x0 = None
        self.simulation_counter = 0
        self.num_simulations = 0
        self.kernel_variance = None

        # Define simulator that keeps track of budget.
        def simulate_with_budget(theta):
            self.simulation_counter += theta.shape[0]
            return self._batched_simulator(theta)

        self._simulate_with_budget = simulate_with_budget

    def __call__(
        self,
        x_o: Union[Tensor, ndarray],
        num_particles: int,
        num_initial_pop: int,
        num_simulations: int,
        epsilon_decay: float,
        distance_based_decay: bool = False,
        ess_min: Optional[float] = None,
        kernel_variance_scale: float = 1.0,
        use_last_pop_samples: bool = True,
        return_summary: bool = False,
        kde: bool = False,
        kde_kwargs: Optional[Dict[str, Any]] = None,
        kde_sample_weights: bool = False,
        lra: bool = False,
        lra_with_weights: bool = False,
        sass: bool = False,
        sass_fraction: float = 0.25,
        sass_expansion_degree: int = 1,
        num_iid_samples: int = 1,
    ) -> Union[Tensor, KDEWrapper, Tuple[Tensor, dict], Tuple[KDEWrapper, dict]]:
        r"""Run SMCABC and return accepted parameters or KDE object fitted on them.

        Args:
            x_o: Observed data.
            num_particles: Number of particles in each population.
            num_initial_pop: Number of simulations used for initial population.
            num_simulations: Total number of possible simulations.
            epsilon_decay: Factor with which the acceptance threshold $\epsilon$ decays.
            distance_based_decay: Whether the $\epsilon$ decay is constant over
                populations or calculated from the previous populations distribution of
                distances.
            ess_min: Threshold of effective sampling size for resampling weights. Not
                used when None (default).
            kernel_variance_scale: Factor for scaling the perturbation kernel variance.
            use_last_pop_samples: Whether to fill up the current population with
                samples from the previous population when the budget is used up. If
                False, the current population is discarded and the previous population
                is returned.
            lra: Whether to run linear regression adjustment as in Beaumont et al. 2002
            lra_with_weights: Whether to run lra as weighted linear regression with SMC
                weights
            sass: Whether to determine semi-automatic summary statistics (sass) as in
                Fearnhead & Prangle 2012.
            sass_fraction: Fraction of simulation budget used for the initial sass run.
            sass_expansion_degree: Degree of the polynomial feature expansion for the
                sass regression, default 1 - no expansion.
            kde: Whether to run KDE on the accepted parameters to return a KDE
                object from which one can sample.
            kde_kwargs: kwargs for performing KDE:
                'bandwidth='; either a float, or a string naming a bandwidth
                heuristics, e.g., 'cv' (cross validation), 'silvermann' or 'scott',
                default 'cv'.
                'transform': transform applied to the parameters before doing KDE.
                'sample_weights': weights associated with samples. See 'get_kde' for
                more details
            kde_sample_weights: Whether perform weighted KDE with SMC weights or on raw
                particles.
            return_summary: Whether to return a dictionary with all accepted particles,
                weights, etc. at the end.
            num_iid_samples: Number of simulations per parameter. Choose
                `num_iid_samples>1`, if you have chosen a statistical distance that
                evaluates sets of simulations against a set of reference observations
                instead of a single data-point comparison.

        Returns:
            theta (if kde False): accepted parameters of the last population.
            kde (if kde True): KDE object fitted on accepted parameters, from which one
                can .sample() and .log_prob().
            summary (if return_summary True): dictionary containing the accepted
                paramters (if kde True), distances and simulated data x of all
                populations.
        """

        pop_idx = 0
        self.num_simulations = num_simulations * num_iid_samples
        if kde_kwargs is None:
            kde_kwargs = {}
        assert isinstance(epsilon_decay, float) and epsilon_decay > 0.0
        assert not (
            self.distance.requires_iid_data and lra
        ), "Currently there is no support to run inference "
        "on multiple observations together with lra."
        assert not (
            self.distance.requires_iid_data and sass
        ), "Currently there is no support to run inference "
        "on multiple observations together with sass."

        # Pilot run for SASS.
        if sass:
            num_pilot_simulations = int(sass_fraction * num_simulations)
            self.logger.info(
                "Running SASS with %s pilot samples.", num_pilot_simulations
            )
            sass_transform = self.run_sass_set_xo(
                num_particles,
                num_pilot_simulations,
                x_o,
                num_iid_samples,
                lra,
                sass_expansion_degree,
            )
            # Udpate simulator and xo
            x_o = sass_transform(self.x_o)

            def sass_simulator(theta):
                self.simulation_counter += theta.shape[0]
                return sass_transform(self._batched_simulator(theta))

            self._simulate_with_budget = sass_simulator

        # run initial population
        particles, epsilon, distances, x = self._set_xo_and_sample_initial_population(
            x_o, num_particles, num_initial_pop, num_iid_samples
        )
        log_weights = torch.log(1 / num_particles * torch.ones(num_particles))

        self.logger.info((
            "population=%s, eps=%s, ess=%s, num_sims=%s",
            pop_idx,
            epsilon,
            1.0,
            num_initial_pop,
        ))

        all_particles = [particles]
        all_log_weights = [log_weights]
        all_distances = [distances]
        all_epsilons = [epsilon]
        all_x = [x]

        while self.simulation_counter < self.num_simulations:
            pop_idx += 1
            # Decay based on quantile of distances from previous pop.
            if distance_based_decay:
                epsilon = self._get_next_epsilon(
                    all_distances[pop_idx - 1], epsilon_decay
                )
            # Constant decay.
            else:
                epsilon *= epsilon_decay

            # Get kernel variance from previous pop.
            self.kernel_variance = self.get_kernel_variance(
                all_particles[pop_idx - 1],
                torch.exp(all_log_weights[pop_idx - 1]),
                samples_per_dim=500,
                kernel_variance_scale=kernel_variance_scale,
            )
            particles, log_weights, distances, x = self._sample_next_population(
                particles=all_particles[pop_idx - 1],
                log_weights=all_log_weights[pop_idx - 1],
                distances=all_distances[pop_idx - 1],
                epsilon=epsilon,
                x=all_x[pop_idx - 1],
                num_iid_samples=num_iid_samples,
                use_last_pop_samples=use_last_pop_samples,
            )

            # Resample population if effective sampling size is too small.
            if ess_min is not None:
                particles, log_weights = self.resample_if_ess_too_small(
                    particles, log_weights, ess_min, pop_idx
                )

            self.logger.info((
                "population=%s done: eps={epsilon:.6f}, num_sims=%s.",
                pop_idx,
                epsilon,
                self.simulation_counter,
            ))

            # collect results
            all_particles.append(particles)
            all_log_weights.append(log_weights)
            all_distances.append(distances)
            all_epsilons.append(epsilon)
            all_x.append(x)

        # Maybe run LRA and adjust weights.
        if lra:
            self.logger.info("Running Linear regression adjustment.")
            adjusted_particles, _ = self.run_lra_update_weights(
                particles=all_particles[-1],
                xs=all_x[-1],
                observation=process_x(x_o),
                log_weights=all_log_weights[-1],
                lra_with_weights=lra_with_weights,
            )
            final_particles = adjusted_particles
        else:
            final_particles = all_particles[-1]

        if kde:
            self.logger.info(
                """KDE on %s samples with bandwidth option %s. Beware that KDE can give
                unreliable results when used with too few samples and in high
                dimensions.""",
                final_particles.shape[0],
                kde_kwargs.get("bandwidth", "cv"),
            )
            # Maybe get particles weights from last population for weighted KDE.
            if kde_sample_weights:
                kde_kwargs["sample_weights"] = all_log_weights[-1].exp()

            kde_dist = get_kde(final_particles, **kde_kwargs)

            if return_summary:
                return (
                    kde_dist,
                    dict(
                        particles=all_particles,
                        weights=all_log_weights,
                        epsilons=all_epsilons,
                        distances=all_distances,
                        xs=all_x,
                    ),
                )
            else:
                return kde_dist

        if return_summary:
            return (
                final_particles,
                dict(
                    particles=all_particles,
                    weights=all_log_weights,
                    epsilons=all_epsilons,
                    distances=all_distances,
                    xs=all_x,
                ),
            )
        else:
            return final_particles

    def _set_xo_and_sample_initial_population(
        self,
        x_o: Array,
        num_particles: int,
        num_initial_pop: int,
        num_iid_samples: int,
    ) -> Tuple[Tensor, float, Tensor, Tensor]:
        """Return particles, epsilon and distances of initial population."""

        assert (
            num_particles <= num_initial_pop
        ), "number of initial round simulations must be greater than population size"

        assert (x_o.shape[0] == 1) or self.distance.requires_iid_data, (
            "Your data contain iid data-points, but the choice of "
            "your distance does not allow multiple conditioning "
            "observations."
        )

        theta = self.prior.sample((num_initial_pop,))

        theta_repeat = theta.repeat_interleave(num_iid_samples, dim=0)
        x = self._simulate_with_budget(theta_repeat)
        x = x.reshape((
            num_initial_pop,
            num_iid_samples,
            -1,
        ))  # Dim(num_initial_pop, num_iid_samples, -1)

        # Infer x shape to test and set x_o.
        if not self.distance.requires_iid_data:
            x = x.squeeze(1)
            self.x_shape = x[0].shape
        else:
            self.x_shape = x[0, 0].shape
        self.x_o = process_x(x_o, self.x_shape)

        distances = self.distance(self.x_o, x)
        sortidx = torch.argsort(distances)
        particles = theta[sortidx][:num_particles]
        # Take last accepted distance as epsilon.
        initial_epsilon = distances[sortidx][num_particles - 1].item()

        if not math.isfinite(initial_epsilon):
            initial_epsilon = 1e8

        return (
            particles,
            initial_epsilon,
            distances[sortidx][:num_particles],
            x[sortidx][:num_particles],
        )

    def _sample_next_population(
        self,
        particles: Tensor,
        log_weights: Tensor,
        distances: Tensor,
        epsilon: float,
        x: Tensor,
        num_iid_samples: int,
        use_last_pop_samples: bool = True,
    ) -> Tuple[Tensor, Tensor, Tensor, Tensor]:
        """Return particles, weights and distances of new population."""

        new_particles = []
        new_log_weights = []
        new_distances = []
        new_x = []

        num_accepted_particles = 0
        num_particles = particles.shape[0]

        while num_accepted_particles < num_particles:
            # Upperbound for batch size to not exceed simulation budget.
            num_batch = min(
                num_particles - num_accepted_particles,
                self.num_simulations - self.simulation_counter,
            )

            # Sample from previous population and perturb.
            particle_candidates = self._sample_and_perturb(
                particles, torch.exp(log_weights), num_samples=num_batch
            )
            # Simulate and select based on distance.
            candidates_repeated = particle_candidates.repeat_interleave(
                num_iid_samples, dim=0
            )
            x_candidates = self._simulate_with_budget(candidates_repeated)
            x_candidates = x_candidates.reshape((
                num_batch,
                num_iid_samples,
                -1,
            ))  # Dim(num_initial_pop, num_iid_samples, -1)
            if not self.distance.requires_iid_data:
                x_candidates = x_candidates.squeeze(1)

            dists = self.distance(self.x_o, x_candidates)
            is_accepted = dists <= epsilon
            num_accepted_batch = int(is_accepted.sum().item())

            if num_accepted_batch > 0:
                new_particles.append(particle_candidates[is_accepted])
                new_log_weights.append(
                    self._calculate_new_log_weights(
                        particle_candidates[is_accepted],
                        particles,
                        log_weights,
                    )
                )
                new_distances.append(dists[is_accepted])
                new_x.append(x_candidates[is_accepted])
                num_accepted_particles += num_accepted_batch

            # If simulation budget was exceeded and we still need particles, take
            # previous population or fill up with previous population.
            if (
                self.simulation_counter >= self.num_simulations
                and num_accepted_particles < num_particles
            ):
                if use_last_pop_samples:
                    num_remaining = num_particles - num_accepted_particles
                    self.logger.info(
                        """Simulation Budget exceeded, filling up with %s
                        samples from last population.""",
                        num_remaining,
                    )
                    # Some new particles have been accepted already, therefore
                    # fill up the remaining once with old particles and weights.
                    new_particles.append(particles[:num_remaining, :])
                    # Recalculate weights with new particles.
                    new_log_weights = [
                        self._calculate_new_log_weights(
                            torch.cat(new_particles),
                            particles,
                            log_weights,
                        )
                    ]
                    new_distances.append(distances[:num_remaining])
                    new_x.append(x[:num_remaining])
                else:
                    self.logger.info(
                        "Simulation Budget exceeded, returning previous population."
                    )
                    new_particles = [particles]
                    new_log_weights = [log_weights]
                    new_distances = [distances]
                    new_x = [x]

                break

        # collect lists of tensors into tensors
        new_particles = torch.cat(new_particles)
        new_log_weights = torch.cat(new_log_weights)
        new_distances = torch.cat(new_distances)
        new_x = torch.cat(new_x)

        # normalize the new weights
        new_log_weights -= torch.logsumexp(new_log_weights, dim=0)

        # Return sorted wrt distances.
        sort_idx = torch.argsort(new_distances)

        return (
            new_particles[sort_idx],
            new_log_weights[sort_idx],
            new_distances[sort_idx],
            new_x[sort_idx],
        )

    def _get_next_epsilon(self, distances: Tensor, quantile: float) -> float:
        """Return epsilon for next round based on quantile of this round's distances.

        Note: distances are made unique to avoid repeated distances from simulations
        that result in the same observation.

        Args:
            distances: The distances accepted in this round.
            quantile: Quantile in the distance distribution to determine new epsilon.

        Returns:
            epsilon: Epsilon for the next population.
        """
        # Take unique distances to skip same distances simulations (return is sorted).
        distances = torch.unique(distances)
        # Cumsum as cdf proxy.
        distances_cdf = torch.cumsum(distances, dim=0) / distances.sum()
        # Take the q quantile of distances.
        try:
            qidx = torch.where(distances_cdf >= quantile)[0][0]
        except IndexError:
            self.logger.warning((
                """Accepted unique distances=%s don't match quantile=%s. Selecting
                    last distance.""",
                distances,
                quantile,
            ))
            qidx = -1

        # The new epsilon is given by that distance.
        return distances[qidx].item()

    def _calculate_new_log_weights(
        self,
        new_particles: Tensor,
        old_particles: Tensor,
        old_log_weights: Tensor,
    ) -> Tensor:
        """Return new log weights following formulas in publications A,B anc C."""

        # Prior can be batched across new particles.
        prior_log_probs = self.prior.log_prob(new_particles)

        # Contstruct function to get kernel log prob for given old particle.
        # The kernel is centered on each old particle as in all three variants (A,B,C).
        def kernel_log_prob(new_particle):
            return self.get_new_kernel(old_particles).log_prob(new_particle)

        # We still have to loop over particles here because
        # the kernel log probs are already batched across old particles.
        log_weighted_sum = torch.tensor(
            [
                torch.logsumexp(old_log_weights + kernel_log_prob(new_particle), dim=0)
                for new_particle in new_particles
            ],
            dtype=torch.float32,
        )
        # new weights are prior probs over weighted sum:
        return prior_log_probs - log_weighted_sum

    @staticmethod
    def sample_from_population_with_weights(
        particles: Tensor, weights: Tensor, num_samples: int = 1
    ) -> Tensor:
        """Return samples from particles sampled with weights."""

        # define multinomial with weights as probs
        multi = Multinomial(probs=weights)
        # sample num samples, with replacement
        samples = multi.sample(sample_shape=torch.Size((num_samples,)))
        # get indices of success trials
        indices = torch.where(samples)[1]
        # return those indices from trace
        return particles[indices]

    def _sample_and_perturb(
        self, particles: Tensor, weights: Tensor, num_samples: int = 1
    ) -> Tensor:
        """Sample and perturb batch of new parameters from trace.

        Reject sampled and perturbed parameters outside of prior.
        """

        num_accepted = 0
        parameters = []
        while num_accepted < num_samples:
            parms = self.sample_from_population_with_weights(
                particles, weights, num_samples=num_samples - num_accepted
            )

            # Create kernel on params and perturb.
            parms_perturbed = self.get_new_kernel(parms).sample()

            is_within_prior = within_support(self.prior, parms_perturbed)
            num_accepted += int(is_within_prior.sum().item())

            if num_accepted > 0:
                parameters.append(parms_perturbed[is_within_prior])

        return torch.cat(parameters)

    def get_kernel_variance(
        self,
        particles: Tensor,
        weights: Tensor,
        samples_per_dim: int = 100,
        kernel_variance_scale: float = 1.0,
    ) -> Tensor:
        """Return kernel variance for a given population of particles and weights."""
        if self.kernel == "gaussian":
            # For variant C, Beaumont et al. 2009, the kernel variance comes from the
            # previous population.
            if self.algorithm_variant == "C":
                # Calculate weighted covariance of particles.
                population_cov = torch.tensor(
                    np.atleast_2d(np.cov(particles, rowvar=False, aweights=weights)),
                    dtype=torch.float32,
                )
                # Make sure variance is nonsingular.
                try:
                    torch.linalg.cholesky(kernel_variance_scale * population_cov)
                except RuntimeError:
                    self.logger.warning(
                        """"Singular particle covariance, using unit covariance."""
                    )
                    population_cov = torch.eye(particles.shape[1])
                return kernel_variance_scale * population_cov
            # While for Toni et al. and Sisson et al. it comes from the parameter
            # ranges.
            elif self.algorithm_variant in ("A", "B"):
                particle_ranges = self.get_particle_ranges(
                    particles, weights, samples_per_dim=samples_per_dim
                )
                return kernel_variance_scale * torch.diag(particle_ranges)
            else:
                raise ValueError(f"Variant, '{self.algorithm_variant}' not supported.")
        elif self.kernel == "uniform":
            # Variance spans the range of parameters for every dimension.
            return kernel_variance_scale * self.get_particle_ranges(
                particles, weights, samples_per_dim=samples_per_dim
            )
        else:
            raise ValueError(f"Kernel, '{self.kernel}' not supported.")

    def get_new_kernel(self, thetas: Tensor) -> Distribution:
        """Return new kernel distribution for a given set of paramters."""

        if self.kernel == "gaussian":
            assert self.kernel_variance is not None, "get kernel variance first."
            assert self.kernel_variance.ndim == 2
            return MultivariateNormal(
                loc=thetas, covariance_matrix=self.kernel_variance
            )

        elif self.kernel == "uniform":
            low = thetas - self.kernel_variance
            high = thetas + self.kernel_variance
            # Move batch shape to event shape to get Uniform that is multivariate in
            # parameter dimension.
            return BoxUniform(low=low, high=high)
        else:
            raise ValueError(f"Kernel, '{self.kernel}' not supported.")

    def resample_if_ess_too_small(
        self,
        particles: Tensor,
        log_weights: Tensor,
        ess_min: float,
        pop_idx: int,
    ) -> Tuple[Tensor, Tensor]:
        """Return resampled particles and uniform weights if effectice sampling size is
        too small.
        """

        num_particles = particles.shape[0]
        ess = (1 / torch.sum(torch.exp(2.0 * log_weights), dim=0)) / num_particles
        # Resampling of weights for low ESS only for Sisson et al. 2007.
        if ess < ess_min:
            self.logger.info("ESS=%s too low, resampling pop %s...", ess, pop_idx)
            # First resample, then set to uniform weights as in Sisson et al. 2007.
            particles = self.sample_from_population_with_weights(
                particles, torch.exp(log_weights), num_samples=num_particles
            )
            log_weights = torch.log(1 / num_particles * torch.ones(num_particles))

        return particles, log_weights

    def run_lra_update_weights(
        self,
        particles: Tensor,
        xs: Tensor,
        observation: Tensor,
        log_weights: Tensor,
        lra_with_weights: bool,
    ) -> Tuple[Tensor, Tensor]:
        """Return particles and weights adjusted with LRA.

        Runs (weighted) linear regression from xs onto particles to adjust the
        particles.

        Updates the SMC weights according to the new particles.
        """

        adjusted_particels = self.run_lra(
            theta=particles,
            x=xs,
            observation=observation,
            sample_weight=log_weights.exp() if lra_with_weights else None,
        )

        # Update SMC weights with LRA adjusted weights
        adjusted_log_weights = self._calculate_new_log_weights(
            new_particles=adjusted_particels,
            old_particles=particles,
            old_log_weights=log_weights,
        )

        return adjusted_particels, adjusted_log_weights

    def run_sass_set_xo(
        self,
        num_particles: int,
        num_pilot_simulations: int,
        x_o,
        num_iid_samples: int,
        lra: bool = False,
        sass_expansion_degree: int = 1,
    ) -> Callable:
        """Return transform for semi-automatic summary statistics.

        Runs an single round of rejection abc with fixed budget and accepts
        num_particles simulations to run the regression for sass.

        Sets self.x_o once the x_shape can be derived from simulations.
        """
        (
            pilot_particles,
            _,
            _,
            pilot_xs,
        ) = self._set_xo_and_sample_initial_population(
            x_o, num_particles, num_pilot_simulations, num_iid_samples
        )
        assert self.x_o is not None, "x_o not set yet."

        # Adjust with LRA.
        if lra:
            pilot_particles = self.run_lra(pilot_particles, pilot_xs, self.x_o)
        sass_transform = self.get_sass_transform(
            pilot_particles,
            pilot_xs,
            expansion_degree=sass_expansion_degree,
            sample_weight=None,
        )
        return sass_transform

    def get_particle_ranges(
        self, particles: Tensor, weights: Tensor, samples_per_dim: int = 100
    ) -> Tensor:
        """Return range of particles in each parameter dimension."""

        # get weighted samples
        samples = self.sample_from_population_with_weights(
            particles,
            weights,
            num_samples=samples_per_dim * particles.shape[1],
        )

        # Variance spans the range of particles for every dimension.
        particle_ranges = samples.max(0).values - samples.min(0).values
        assert particle_ranges.ndim < 2
        return particle_ranges

__call__(x_o, num_particles, num_initial_pop, num_simulations, epsilon_decay, distance_based_decay=False, ess_min=None, kernel_variance_scale=1.0, use_last_pop_samples=True, return_summary=False, kde=False, kde_kwargs=None, kde_sample_weights=False, lra=False, lra_with_weights=False, sass=False, sass_fraction=0.25, sass_expansion_degree=1, num_iid_samples=1)

Run SMCABC and return accepted parameters or KDE object fitted on them.

Parameters:

Name Type Description Default
x_o Union[Tensor, ndarray]

Observed data.

required
num_particles int

Number of particles in each population.

required
num_initial_pop int

Number of simulations used for initial population.

required
num_simulations int

Total number of possible simulations.

required
epsilon_decay float

Factor with which the acceptance threshold \(\epsilon\) decays.

required
distance_based_decay bool

Whether the \(\epsilon\) decay is constant over populations or calculated from the previous populations distribution of distances.

False
ess_min Optional[float]

Threshold of effective sampling size for resampling weights. Not used when None (default).

None
kernel_variance_scale float

Factor for scaling the perturbation kernel variance.

1.0
use_last_pop_samples bool

Whether to fill up the current population with samples from the previous population when the budget is used up. If False, the current population is discarded and the previous population is returned.

True
lra bool

Whether to run linear regression adjustment as in Beaumont et al. 2002

False
lra_with_weights bool

Whether to run lra as weighted linear regression with SMC weights

False
sass bool

Whether to determine semi-automatic summary statistics (sass) as in Fearnhead & Prangle 2012.

False
sass_fraction float

Fraction of simulation budget used for the initial sass run.

0.25
sass_expansion_degree int

Degree of the polynomial feature expansion for the sass regression, default 1 - no expansion.

1
kde bool

Whether to run KDE on the accepted parameters to return a KDE object from which one can sample.

False
kde_kwargs Optional[Dict[str, Any]]

kwargs for performing KDE: ‘bandwidth=’; either a float, or a string naming a bandwidth heuristics, e.g., ‘cv’ (cross validation), ‘silvermann’ or ‘scott’, default ‘cv’. ‘transform’: transform applied to the parameters before doing KDE. ‘sample_weights’: weights associated with samples. See ‘get_kde’ for more details

None
kde_sample_weights bool

Whether perform weighted KDE with SMC weights or on raw particles.

False
return_summary bool

Whether to return a dictionary with all accepted particles, weights, etc. at the end.

False
num_iid_samples int

Number of simulations per parameter. Choose num_iid_samples>1, if you have chosen a statistical distance that evaluates sets of simulations against a set of reference observations instead of a single data-point comparison.

1

Returns:

Name Type Description
theta if kde False

accepted parameters of the last population.

kde if kde True

KDE object fitted on accepted parameters, from which one can .sample() and .log_prob().

summary if return_summary True

dictionary containing the accepted paramters (if kde True), distances and simulated data x of all populations.

Source code in sbi/inference/abc/smcabc.py
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
def __call__(
    self,
    x_o: Union[Tensor, ndarray],
    num_particles: int,
    num_initial_pop: int,
    num_simulations: int,
    epsilon_decay: float,
    distance_based_decay: bool = False,
    ess_min: Optional[float] = None,
    kernel_variance_scale: float = 1.0,
    use_last_pop_samples: bool = True,
    return_summary: bool = False,
    kde: bool = False,
    kde_kwargs: Optional[Dict[str, Any]] = None,
    kde_sample_weights: bool = False,
    lra: bool = False,
    lra_with_weights: bool = False,
    sass: bool = False,
    sass_fraction: float = 0.25,
    sass_expansion_degree: int = 1,
    num_iid_samples: int = 1,
) -> Union[Tensor, KDEWrapper, Tuple[Tensor, dict], Tuple[KDEWrapper, dict]]:
    r"""Run SMCABC and return accepted parameters or KDE object fitted on them.

    Args:
        x_o: Observed data.
        num_particles: Number of particles in each population.
        num_initial_pop: Number of simulations used for initial population.
        num_simulations: Total number of possible simulations.
        epsilon_decay: Factor with which the acceptance threshold $\epsilon$ decays.
        distance_based_decay: Whether the $\epsilon$ decay is constant over
            populations or calculated from the previous populations distribution of
            distances.
        ess_min: Threshold of effective sampling size for resampling weights. Not
            used when None (default).
        kernel_variance_scale: Factor for scaling the perturbation kernel variance.
        use_last_pop_samples: Whether to fill up the current population with
            samples from the previous population when the budget is used up. If
            False, the current population is discarded and the previous population
            is returned.
        lra: Whether to run linear regression adjustment as in Beaumont et al. 2002
        lra_with_weights: Whether to run lra as weighted linear regression with SMC
            weights
        sass: Whether to determine semi-automatic summary statistics (sass) as in
            Fearnhead & Prangle 2012.
        sass_fraction: Fraction of simulation budget used for the initial sass run.
        sass_expansion_degree: Degree of the polynomial feature expansion for the
            sass regression, default 1 - no expansion.
        kde: Whether to run KDE on the accepted parameters to return a KDE
            object from which one can sample.
        kde_kwargs: kwargs for performing KDE:
            'bandwidth='; either a float, or a string naming a bandwidth
            heuristics, e.g., 'cv' (cross validation), 'silvermann' or 'scott',
            default 'cv'.
            'transform': transform applied to the parameters before doing KDE.
            'sample_weights': weights associated with samples. See 'get_kde' for
            more details
        kde_sample_weights: Whether perform weighted KDE with SMC weights or on raw
            particles.
        return_summary: Whether to return a dictionary with all accepted particles,
            weights, etc. at the end.
        num_iid_samples: Number of simulations per parameter. Choose
            `num_iid_samples>1`, if you have chosen a statistical distance that
            evaluates sets of simulations against a set of reference observations
            instead of a single data-point comparison.

    Returns:
        theta (if kde False): accepted parameters of the last population.
        kde (if kde True): KDE object fitted on accepted parameters, from which one
            can .sample() and .log_prob().
        summary (if return_summary True): dictionary containing the accepted
            paramters (if kde True), distances and simulated data x of all
            populations.
    """

    pop_idx = 0
    self.num_simulations = num_simulations * num_iid_samples
    if kde_kwargs is None:
        kde_kwargs = {}
    assert isinstance(epsilon_decay, float) and epsilon_decay > 0.0
    assert not (
        self.distance.requires_iid_data and lra
    ), "Currently there is no support to run inference "
    "on multiple observations together with lra."
    assert not (
        self.distance.requires_iid_data and sass
    ), "Currently there is no support to run inference "
    "on multiple observations together with sass."

    # Pilot run for SASS.
    if sass:
        num_pilot_simulations = int(sass_fraction * num_simulations)
        self.logger.info(
            "Running SASS with %s pilot samples.", num_pilot_simulations
        )
        sass_transform = self.run_sass_set_xo(
            num_particles,
            num_pilot_simulations,
            x_o,
            num_iid_samples,
            lra,
            sass_expansion_degree,
        )
        # Udpate simulator and xo
        x_o = sass_transform(self.x_o)

        def sass_simulator(theta):
            self.simulation_counter += theta.shape[0]
            return sass_transform(self._batched_simulator(theta))

        self._simulate_with_budget = sass_simulator

    # run initial population
    particles, epsilon, distances, x = self._set_xo_and_sample_initial_population(
        x_o, num_particles, num_initial_pop, num_iid_samples
    )
    log_weights = torch.log(1 / num_particles * torch.ones(num_particles))

    self.logger.info((
        "population=%s, eps=%s, ess=%s, num_sims=%s",
        pop_idx,
        epsilon,
        1.0,
        num_initial_pop,
    ))

    all_particles = [particles]
    all_log_weights = [log_weights]
    all_distances = [distances]
    all_epsilons = [epsilon]
    all_x = [x]

    while self.simulation_counter < self.num_simulations:
        pop_idx += 1
        # Decay based on quantile of distances from previous pop.
        if distance_based_decay:
            epsilon = self._get_next_epsilon(
                all_distances[pop_idx - 1], epsilon_decay
            )
        # Constant decay.
        else:
            epsilon *= epsilon_decay

        # Get kernel variance from previous pop.
        self.kernel_variance = self.get_kernel_variance(
            all_particles[pop_idx - 1],
            torch.exp(all_log_weights[pop_idx - 1]),
            samples_per_dim=500,
            kernel_variance_scale=kernel_variance_scale,
        )
        particles, log_weights, distances, x = self._sample_next_population(
            particles=all_particles[pop_idx - 1],
            log_weights=all_log_weights[pop_idx - 1],
            distances=all_distances[pop_idx - 1],
            epsilon=epsilon,
            x=all_x[pop_idx - 1],
            num_iid_samples=num_iid_samples,
            use_last_pop_samples=use_last_pop_samples,
        )

        # Resample population if effective sampling size is too small.
        if ess_min is not None:
            particles, log_weights = self.resample_if_ess_too_small(
                particles, log_weights, ess_min, pop_idx
            )

        self.logger.info((
            "population=%s done: eps={epsilon:.6f}, num_sims=%s.",
            pop_idx,
            epsilon,
            self.simulation_counter,
        ))

        # collect results
        all_particles.append(particles)
        all_log_weights.append(log_weights)
        all_distances.append(distances)
        all_epsilons.append(epsilon)
        all_x.append(x)

    # Maybe run LRA and adjust weights.
    if lra:
        self.logger.info("Running Linear regression adjustment.")
        adjusted_particles, _ = self.run_lra_update_weights(
            particles=all_particles[-1],
            xs=all_x[-1],
            observation=process_x(x_o),
            log_weights=all_log_weights[-1],
            lra_with_weights=lra_with_weights,
        )
        final_particles = adjusted_particles
    else:
        final_particles = all_particles[-1]

    if kde:
        self.logger.info(
            """KDE on %s samples with bandwidth option %s. Beware that KDE can give
            unreliable results when used with too few samples and in high
            dimensions.""",
            final_particles.shape[0],
            kde_kwargs.get("bandwidth", "cv"),
        )
        # Maybe get particles weights from last population for weighted KDE.
        if kde_sample_weights:
            kde_kwargs["sample_weights"] = all_log_weights[-1].exp()

        kde_dist = get_kde(final_particles, **kde_kwargs)

        if return_summary:
            return (
                kde_dist,
                dict(
                    particles=all_particles,
                    weights=all_log_weights,
                    epsilons=all_epsilons,
                    distances=all_distances,
                    xs=all_x,
                ),
            )
        else:
            return kde_dist

    if return_summary:
        return (
            final_particles,
            dict(
                particles=all_particles,
                weights=all_log_weights,
                epsilons=all_epsilons,
                distances=all_distances,
                xs=all_x,
            ),
        )
    else:
        return final_particles

__init__(simulator, prior, distance='l2', requires_iid_data=None, distance_kwargs=None, num_workers=1, simulation_batch_size=1, distance_batch_size=-1, show_progress_bars=True, kernel='gaussian', algorithm_variant='C')

Sequential Monte Carlo Approximate Bayesian Computation.

We distinguish between three different SMC methods here
  • A: Toni et al. 2010 (Phd Thesis)
  • B: Sisson et al. 2007 (with correction from 2009)
  • C: Beaumont et al. 2009

In Toni et al. 2010 we find an overview of the differences on page 34: - B: same as A except for resampling of weights if the effective sampling size is too small. - C: same as A except for calculation of the covariance of the perturbation kernel: the kernel covariance is a scaled version of the covariance of the previous population.

Parameters:

Name Type Description Default
simulator Callable

A function that takes parameters \(\theta\) and maps them to simulations, or observations, x, \(\mathrm{sim}(\theta)\to x\). Any regular Python callable (i.e. function or class with __call__ method) can be used.

required
prior Distribution

A probability distribution that expresses prior knowledge about the parameters, e.g. which ranges are meaningful for them. Any object with .log_prob()and .sample() (for example, a PyTorch distribution) can be used.

required
distance Union[str, Callable]

Distance function to compare observed and simulated data. Can be a custom callable function or one of l1, l2, mse, mmd, wasserstein.

'l2'
requires_iid_data Optional[None]

Whether to allow conditioning on iid sampled data or not. Typically, this information is inferred by the choice of the distance, but in case a custom distance is used, this information is pivotal.

None
distance_kwargs Optional[Dict]

Configurations parameters for the distances. In particular useful for the MMD and Wasserstein distance.

None
num_workers int

Number of parallel workers to use for simulations.

1
simulation_batch_size int

Number of parameter sets that the simulator maps to data x at once. If None, we simulate all parameter sets at the same time. If >= 1, the simulator has to process data of shape (simulation_batch_size, parameter_dimension).

1
distance_batch_size int

Number of simulations that the distance function evaluates against the reference observations at once. If -1, we evaluate all simulations at the same time.

-1
show_progress_bars bool

Whether to show a progressbar during simulation and sampling.

True
kernel Optional[str]

Perturbation kernel.

'gaussian'
algorithm_variant str

Indicating the choice of algorithm variant, A, B, or C.

'C'
Source code in sbi/inference/abc/smcabc.py
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def __init__(
    self,
    simulator: Callable,
    prior: Distribution,
    distance: Union[str, Callable] = "l2",
    requires_iid_data: Optional[None] = None,
    distance_kwargs: Optional[Dict] = None,
    num_workers: int = 1,
    simulation_batch_size: int = 1,
    distance_batch_size: int = -1,
    show_progress_bars: bool = True,
    kernel: Optional[str] = "gaussian",
    algorithm_variant: str = "C",
):
    r"""Sequential Monte Carlo Approximate Bayesian Computation.

    We distinguish between three different SMC methods here:
        - A: Toni et al. 2010 (Phd Thesis)
        - B: Sisson et al. 2007 (with correction from 2009)
        - C: Beaumont et al. 2009

    In Toni et al. 2010 we find an overview of the differences on page 34:
        - B: same as A except for resampling of weights if the effective sampling
            size is too small.
        - C: same as A except for calculation of the covariance of the perturbation
            kernel: the kernel covariance is a scaled version of the covariance of
            the previous population.

    Args:
        simulator: A function that takes parameters $\theta$ and maps them to
            simulations, or observations, `x`, $\mathrm{sim}(\theta)\to x$. Any
            regular Python callable (i.e. function or class with `__call__` method)
            can be used.
        prior: A probability distribution that expresses prior knowledge about the
            parameters, e.g. which ranges are meaningful for them. Any
            object with `.log_prob()`and `.sample()` (for example, a PyTorch
            distribution) can be used.
        distance: Distance function to compare observed and simulated data. Can be
            a custom callable function or one of `l1`, `l2`, `mse`,
            `mmd`, `wasserstein`.
        requires_iid_data: Whether to allow conditioning on iid sampled data or not.
            Typically, this information is inferred by the choice of the distance,
            but in case a custom distance is used, this information is pivotal.
        distance_kwargs: Configurations parameters for the distances. In particular
            useful for the MMD and Wasserstein distance.
        num_workers: Number of parallel workers to use for simulations.
        simulation_batch_size: Number of parameter sets that the simulator
            maps to data x at once. If None, we simulate all parameter sets at the
            same time. If >= 1, the simulator has to process data of shape
            (simulation_batch_size, parameter_dimension).
        distance_batch_size: Number of simulations that the distance function
            evaluates against the reference observations at once. If -1, we evaluate
            all simulations at the same time.
        show_progress_bars: Whether to show a progressbar during simulation and
            sampling.
        kernel: Perturbation kernel.
        algorithm_variant: Indicating the choice of algorithm variant, A, B, or C.
    """

    super().__init__(
        simulator=simulator,
        prior=prior,
        distance=distance,
        requires_iid_data=requires_iid_data,
        distance_kwargs=distance_kwargs,
        num_workers=num_workers,
        simulation_batch_size=simulation_batch_size,
        distance_batch_size=distance_batch_size,
        show_progress_bars=show_progress_bars,
    )

    kernels = ("gaussian", "uniform")
    assert (
        kernel in kernels
    ), f"Kernel '{kernel}' not supported. Choose one from {kernels}."
    self.kernel = kernel

    algorithm_variants = ("A", "B", "C")
    assert algorithm_variant in algorithm_variants, (
        f"SMCABC variant '{algorithm_variant}' not supported, choose one from"
        " {algorithm_variants}."
    )
    self.algorithm_variant = algorithm_variant
    self.distance_to_x0 = None
    self.simulation_counter = 0
    self.num_simulations = 0
    self.kernel_variance = None

    # Define simulator that keeps track of budget.
    def simulate_with_budget(theta):
        self.simulation_counter += theta.shape[0]
        return self._batched_simulator(theta)

    self._simulate_with_budget = simulate_with_budget

get_kernel_variance(particles, weights, samples_per_dim=100, kernel_variance_scale=1.0)

Return kernel variance for a given population of particles and weights.

Source code in sbi/inference/abc/smcabc.py
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
def get_kernel_variance(
    self,
    particles: Tensor,
    weights: Tensor,
    samples_per_dim: int = 100,
    kernel_variance_scale: float = 1.0,
) -> Tensor:
    """Return kernel variance for a given population of particles and weights."""
    if self.kernel == "gaussian":
        # For variant C, Beaumont et al. 2009, the kernel variance comes from the
        # previous population.
        if self.algorithm_variant == "C":
            # Calculate weighted covariance of particles.
            population_cov = torch.tensor(
                np.atleast_2d(np.cov(particles, rowvar=False, aweights=weights)),
                dtype=torch.float32,
            )
            # Make sure variance is nonsingular.
            try:
                torch.linalg.cholesky(kernel_variance_scale * population_cov)
            except RuntimeError:
                self.logger.warning(
                    """"Singular particle covariance, using unit covariance."""
                )
                population_cov = torch.eye(particles.shape[1])
            return kernel_variance_scale * population_cov
        # While for Toni et al. and Sisson et al. it comes from the parameter
        # ranges.
        elif self.algorithm_variant in ("A", "B"):
            particle_ranges = self.get_particle_ranges(
                particles, weights, samples_per_dim=samples_per_dim
            )
            return kernel_variance_scale * torch.diag(particle_ranges)
        else:
            raise ValueError(f"Variant, '{self.algorithm_variant}' not supported.")
    elif self.kernel == "uniform":
        # Variance spans the range of parameters for every dimension.
        return kernel_variance_scale * self.get_particle_ranges(
            particles, weights, samples_per_dim=samples_per_dim
        )
    else:
        raise ValueError(f"Kernel, '{self.kernel}' not supported.")

get_new_kernel(thetas)

Return new kernel distribution for a given set of paramters.

Source code in sbi/inference/abc/smcabc.py
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
def get_new_kernel(self, thetas: Tensor) -> Distribution:
    """Return new kernel distribution for a given set of paramters."""

    if self.kernel == "gaussian":
        assert self.kernel_variance is not None, "get kernel variance first."
        assert self.kernel_variance.ndim == 2
        return MultivariateNormal(
            loc=thetas, covariance_matrix=self.kernel_variance
        )

    elif self.kernel == "uniform":
        low = thetas - self.kernel_variance
        high = thetas + self.kernel_variance
        # Move batch shape to event shape to get Uniform that is multivariate in
        # parameter dimension.
        return BoxUniform(low=low, high=high)
    else:
        raise ValueError(f"Kernel, '{self.kernel}' not supported.")

get_particle_ranges(particles, weights, samples_per_dim=100)

Return range of particles in each parameter dimension.

Source code in sbi/inference/abc/smcabc.py
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
def get_particle_ranges(
    self, particles: Tensor, weights: Tensor, samples_per_dim: int = 100
) -> Tensor:
    """Return range of particles in each parameter dimension."""

    # get weighted samples
    samples = self.sample_from_population_with_weights(
        particles,
        weights,
        num_samples=samples_per_dim * particles.shape[1],
    )

    # Variance spans the range of particles for every dimension.
    particle_ranges = samples.max(0).values - samples.min(0).values
    assert particle_ranges.ndim < 2
    return particle_ranges

resample_if_ess_too_small(particles, log_weights, ess_min, pop_idx)

Return resampled particles and uniform weights if effectice sampling size is too small.

Source code in sbi/inference/abc/smcabc.py
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
def resample_if_ess_too_small(
    self,
    particles: Tensor,
    log_weights: Tensor,
    ess_min: float,
    pop_idx: int,
) -> Tuple[Tensor, Tensor]:
    """Return resampled particles and uniform weights if effectice sampling size is
    too small.
    """

    num_particles = particles.shape[0]
    ess = (1 / torch.sum(torch.exp(2.0 * log_weights), dim=0)) / num_particles
    # Resampling of weights for low ESS only for Sisson et al. 2007.
    if ess < ess_min:
        self.logger.info("ESS=%s too low, resampling pop %s...", ess, pop_idx)
        # First resample, then set to uniform weights as in Sisson et al. 2007.
        particles = self.sample_from_population_with_weights(
            particles, torch.exp(log_weights), num_samples=num_particles
        )
        log_weights = torch.log(1 / num_particles * torch.ones(num_particles))

    return particles, log_weights

run_lra_update_weights(particles, xs, observation, log_weights, lra_with_weights)

Return particles and weights adjusted with LRA.

Runs (weighted) linear regression from xs onto particles to adjust the particles.

Updates the SMC weights according to the new particles.

Source code in sbi/inference/abc/smcabc.py
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
def run_lra_update_weights(
    self,
    particles: Tensor,
    xs: Tensor,
    observation: Tensor,
    log_weights: Tensor,
    lra_with_weights: bool,
) -> Tuple[Tensor, Tensor]:
    """Return particles and weights adjusted with LRA.

    Runs (weighted) linear regression from xs onto particles to adjust the
    particles.

    Updates the SMC weights according to the new particles.
    """

    adjusted_particels = self.run_lra(
        theta=particles,
        x=xs,
        observation=observation,
        sample_weight=log_weights.exp() if lra_with_weights else None,
    )

    # Update SMC weights with LRA adjusted weights
    adjusted_log_weights = self._calculate_new_log_weights(
        new_particles=adjusted_particels,
        old_particles=particles,
        old_log_weights=log_weights,
    )

    return adjusted_particels, adjusted_log_weights

run_sass_set_xo(num_particles, num_pilot_simulations, x_o, num_iid_samples, lra=False, sass_expansion_degree=1)

Return transform for semi-automatic summary statistics.

Runs an single round of rejection abc with fixed budget and accepts num_particles simulations to run the regression for sass.

Sets self.x_o once the x_shape can be derived from simulations.

Source code in sbi/inference/abc/smcabc.py
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
def run_sass_set_xo(
    self,
    num_particles: int,
    num_pilot_simulations: int,
    x_o,
    num_iid_samples: int,
    lra: bool = False,
    sass_expansion_degree: int = 1,
) -> Callable:
    """Return transform for semi-automatic summary statistics.

    Runs an single round of rejection abc with fixed budget and accepts
    num_particles simulations to run the regression for sass.

    Sets self.x_o once the x_shape can be derived from simulations.
    """
    (
        pilot_particles,
        _,
        _,
        pilot_xs,
    ) = self._set_xo_and_sample_initial_population(
        x_o, num_particles, num_pilot_simulations, num_iid_samples
    )
    assert self.x_o is not None, "x_o not set yet."

    # Adjust with LRA.
    if lra:
        pilot_particles = self.run_lra(pilot_particles, pilot_xs, self.x_o)
    sass_transform = self.get_sass_transform(
        pilot_particles,
        pilot_xs,
        expansion_degree=sass_expansion_degree,
        sample_weight=None,
    )
    return sass_transform

sample_from_population_with_weights(particles, weights, num_samples=1) staticmethod

Return samples from particles sampled with weights.

Source code in sbi/inference/abc/smcabc.py
587
588
589
590
591
592
593
594
595
596
597
598
599
600
@staticmethod
def sample_from_population_with_weights(
    particles: Tensor, weights: Tensor, num_samples: int = 1
) -> Tensor:
    """Return samples from particles sampled with weights."""

    # define multinomial with weights as probs
    multi = Multinomial(probs=weights)
    # sample num samples, with replacement
    samples = multi.sample(sample_shape=torch.Size((num_samples,)))
    # get indices of success trials
    indices = torch.where(samples)[1]
    # return those indices from trace
    return particles[indices]

Helpers

simulate_for_sbi(simulator, proposal, num_simulations, num_workers=1, simulation_batch_size=1, seed=None, show_progress_bar=True)

Returns (\(\theta, x\)) pairs obtained from sampling the proposal and simulating.

This function performs two steps:

  • Sample parameters \(\theta\) from the proposal.
  • Simulate these parameters to obtain \(x\).

Parameters:

Name Type Description Default
simulator Callable

A function that takes parameters \(\theta\) and maps them to simulations, or observations, x, \(\text{sim}(\theta)\to x\). Any regular Python callable (i.e. function or class with __call__ method) can be used. Note that the simulator should be able to handle numpy arrays for efficient parallelization. You can use process_simulator to ensure this.

required
proposal Any

Probability distribution that the parameters \(\theta\) are sampled from.

required
num_simulations int

Number of simulations that are run.

required
num_workers int

Number of parallel workers to use for simulations.

1
simulation_batch_size Union[int, None]

Number of parameter sets of shape (simulation_batch_size, parameter_dimension) that the simulator receives per call. If None, we set simulation_batch_size=num_simulations and simulate all parameter sets with one call. Otherwise, we construct batches of parameter sets and distribute them among num_workers.

1
seed Optional[int]

Seed for reproducibility.

None
show_progress_bar bool

Whether to show a progress bar for simulating. This will not affect whether there will be a progressbar while drawing samples from the proposal.

True

Returns: Sampled parameters \(\theta\) and simulation-outputs \(x\).

Source code in sbi/utils/simulation_utils.py
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
def simulate_for_sbi(
    simulator: Callable,
    proposal: Any,
    num_simulations: int,
    num_workers: int = 1,
    simulation_batch_size: Union[int, None] = 1,
    seed: Optional[int] = None,
    show_progress_bar: bool = True,
) -> Tuple[Tensor, Tensor]:
    r"""Returns ($\theta, x$) pairs obtained from sampling the proposal and simulating.

    This function performs two steps:

    - Sample parameters $\theta$ from the `proposal`.
    - Simulate these parameters to obtain $x$.

    Args:
        simulator: A function that takes parameters $\theta$ and maps them to
            simulations, or observations, `x`, $\text{sim}(\theta)\to x$. Any
            regular Python callable (i.e. function or class with `__call__` method)
            can be used. Note that the simulator should be able to handle numpy
            arrays for efficient parallelization. You can use
            `process_simulator` to ensure this.
        proposal: Probability distribution that the parameters $\theta$ are sampled
            from.
        num_simulations: Number of simulations that are run.
        num_workers: Number of parallel workers to use for simulations.
        simulation_batch_size: Number of parameter sets of shape
            (simulation_batch_size, parameter_dimension) that the simulator
            receives per call. If None, we set
            simulation_batch_size=num_simulations and simulate all parameter
            sets with one call. Otherwise, we construct batches of parameter
            sets and distribute them among num_workers.
        seed: Seed for reproducibility.
        show_progress_bar: Whether to show a progress bar for simulating. This will not
            affect whether there will be a progressbar while drawing samples from the
            proposal.

    Returns: Sampled parameters $\theta$ and simulation-outputs $x$.
    """

    if num_simulations == 0:
        theta = torch.tensor([], dtype=float32)
        x = torch.tensor([], dtype=float32)

    else:
        # Cast theta to numpy for better joblib performance (seee #1175)
        seed_all_backends(seed)
        theta = proposal.sample((num_simulations,))

        # Parse the simulation_batch_size logic
        if simulation_batch_size is None:
            simulation_batch_size = num_simulations
        else:
            simulation_batch_size = min(simulation_batch_size, num_simulations)

        if num_workers != 1:
            # For multiprocessing, we want to switch to numpy arrays.
            # The batch size will be an approximation, since np.array_split does
            # not take as argument the size of the batch but their total.
            num_batches = num_simulations // simulation_batch_size
            batches = np.array_split(theta.numpy(), num_batches, axis=0)
            batch_seeds = np.random.randint(low=0, high=1_000_000, size=(len(batches),))

            # define seeded simulator.
            def simulator_seeded(theta: ndarray, seed: int) -> Tensor:
                seed_all_backends(seed)
                return simulator(theta)

            try:  # catch TypeError to give more informative error message
                simulation_outputs: list[Tensor] = [  # pyright: ignore
                    xx
                    for xx in tqdm(
                        Parallel(return_as="generator", n_jobs=num_workers)(
                            delayed(simulator_seeded)(batch, seed)
                            for batch, seed in zip(batches, batch_seeds)
                        ),
                        total=num_simulations,
                        disable=not show_progress_bar,
                    )
                ]
            except TypeError as err:
                raise TypeError(
                    "For multiprocessing, we switch to numpy arrays. Make sure to "
                    "preprocess your simulator with `process_simulator` to handle numpy"
                    " arrays."
                ) from err

        else:
            simulation_outputs: list[Tensor] = []
            batches = torch.split(theta, simulation_batch_size)
            for batch in tqdm(batches, disable=not show_progress_bar):
                simulation_outputs.append(simulator(batch))

        # Correctly format the output
        x = torch.cat(simulation_outputs, dim=0)
        theta = torch.as_tensor(theta, dtype=float32)

    return theta, x

process_prior(prior, custom_prior_wrapper_kwargs=None)

Return PyTorch distribution-like prior from user-provided prior.

NOTE: If the prior argument is a sequence of PyTorch distributions, they will be interpreted as independent prior dimensions wrapped in a MultipleIndependent pytorch Distribution. In case the elements are not PyTorch distributions, make sure to use process_prior on each element in the list beforehand. See FAQ 7 for details.

NOTE: returns a tuple (processed_prior, num_params, whether_prior_returns_numpy). The last two entries in the tuple can be passed on to process_simulator to prepare the simulator as well. For example, it will take care of casting parameters to numpy or adding a batch dimension to the simulator output, if needed.

Parameters:

Name Type Description Default
prior Union[Sequence[Distribution], Distribution, rv_frozen, multi_rv_frozen]

Prior object with .sample() and .log_prob() as provided by the user, or a sequence of such objects.

required
custom_prior_wrapper_kwargs Optional[Dict]

kwargs to be passed to the class that wraps a custom prior into a pytorch Distribution, e.g., for passing bounds for a prior with bounded support (lower_bound, upper_bound), or argument constraints. (arg_constraints), see pytorch.distributions.Distribution for more info.

None

Raises:

Type Description
AttributeError

If prior objects lacks .sample() or .log_prob().

Returns:

Name Type Description
prior Distribution

Prior that emits samples and evaluates log prob as PyTorch Tensors.

theta_numel int

Number of parameters - elements in a single sample from the prior.

prior_returns_numpy bool

Whether the return type of the prior was a Numpy array.

Source code in sbi/utils/user_input_checks.py
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
def process_prior(
    prior: Union[Sequence[Distribution], Distribution, rv_frozen, multi_rv_frozen],
    custom_prior_wrapper_kwargs: Optional[Dict] = None,
) -> Tuple[Distribution, int, bool]:
    """Return PyTorch distribution-like prior from user-provided prior.

    NOTE: If the prior argument is a sequence of PyTorch distributions, they will be
    interpreted as independent prior dimensions wrapped in a `MultipleIndependent`
    pytorch Distribution. In case the elements are not PyTorch distributions, make sure
    to use process_prior on each element in the list beforehand. See FAQ 7 for details.

    NOTE: returns a tuple (processed_prior, num_params, whether_prior_returns_numpy).
    The last two entries in the tuple can be passed on to `process_simulator` to prepare
    the simulator as well. For example, it will take care of casting parameters to numpy
    or adding a batch dimension to the simulator output, if needed.

    Args:
        prior: Prior object with `.sample()` and `.log_prob()` as provided by the user,
            or a sequence of such objects.
        custom_prior_wrapper_kwargs: kwargs to be passed to the class that wraps a
            custom prior into a pytorch Distribution, e.g., for passing bounds for a
            prior with bounded support (lower_bound, upper_bound), or argument
            constraints.
            (arg_constraints), see pytorch.distributions.Distribution for more info.

    Raises:
        AttributeError: If prior objects lacks `.sample()` or `.log_prob()`.

    Returns:
        prior: Prior that emits samples and evaluates log prob as PyTorch Tensors.
        theta_numel: Number of parameters - elements in a single sample from the prior.
        prior_returns_numpy: Whether the return type of the prior was a Numpy array.
    """

    # If prior is a sequence, assume independent components and check as PyTorch prior.
    if isinstance(prior, Sequence):
        warnings.warn(
            f"Prior was provided as a sequence of {len(prior)} priors. They will be "
            "interpreted as independent of each other and matched in order to the "
            "components of the parameter.",
            stacklevel=2,
        )
        # process individual priors
        prior = [process_prior(p, custom_prior_wrapper_kwargs)[0] for p in prior]
        return process_pytorch_prior(MultipleIndependent(prior))

    if isinstance(prior, Distribution):
        return process_pytorch_prior(prior)

    # If prior is given as `scipy.stats` object, wrap as PyTorch.
    elif isinstance(prior, (rv_frozen, multi_rv_frozen)):
        raise NotImplementedError(
            "Passing a prior as scipy.stats object is deprecated. "
            "Please pass it as a PyTorch Distribution."
        )

    # Otherwise it is a custom prior - check for `.sample()` and `.log_prob()`.
    else:
        return process_custom_prior(prior, custom_prior_wrapper_kwargs)

process_simulator(user_simulator, prior, is_numpy_simulator)

Returns a simulator that meets the requirements for usage in sbi.

Parameters:

Name Type Description Default
user_simulator Callable

simulator provided by the user, possibly written in numpy.

required
prior Distribution

prior as pytorch distribution or processed with process_prior.

required
is_numpy_simulator bool

whether the simulator needs theta in numpy types, returned from process_prior.

required

Returns:

Name Type Description
simulator Callable

processed simulator that returns torch.Tensor can handle batches of parameters.

Source code in sbi/utils/user_input_checks.py
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
def process_simulator(
    user_simulator: Callable,
    prior: Distribution,
    is_numpy_simulator: bool,
) -> Callable:
    """Returns a simulator that meets the requirements for usage in sbi.

    Args:
        user_simulator: simulator provided by the user, possibly written in numpy.
        prior: prior as pytorch distribution or processed with `process_prior`.
        is_numpy_simulator: whether the simulator needs theta in numpy types, returned
            from `process_prior`.

    Returns:
        simulator: processed simulator that returns `torch.Tensor` can handle batches
            of parameters.
    """

    assert isinstance(user_simulator, Callable), "Simulator must be a function."

    joblib_simulator = wrap_as_joblib_efficient_simulator(
        user_simulator, prior, is_numpy_simulator
    )

    batch_simulator = ensure_batched_simulator(joblib_simulator, prior)

    return batch_simulator