diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index c32c7de3..0fe686b9 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -25,7 +25,7 @@ repos: - id: mypy # Copied from setup.cfg exclude: ^test/ - additional_dependencies: [ numpy >= 1.22, types-ujson ] + additional_dependencies: [ numpy >= 1.22] # local uses the user-installed pylint, this allows dependency checking - repo: local hooks: diff --git a/.pylintrc b/.pylintrc index d612cb02..03dbfe01 100644 --- a/.pylintrc +++ b/.pylintrc @@ -3,7 +3,7 @@ # A comma-separated list of package or module names from where C extensions may # be loaded. Extensions are loading into the active Python interpreter and may # run arbitrary code. -extension-pkg-whitelist=ujson +extension-pkg-whitelist= # Add files or directories to the blacklist. They should be base names, not # paths. diff --git a/cmdstanpy/model.py b/cmdstanpy/model.py index d45a4358..7d4ab8d6 100644 --- a/cmdstanpy/model.py +++ b/cmdstanpy/model.py @@ -1,22 +1,22 @@ """CmdStanModel""" import io +import json import os import platform import re import shutil import subprocess import sys +import threading from collections import OrderedDict from concurrent.futures import ThreadPoolExecutor from datetime import datetime from io import StringIO from multiprocessing import cpu_count from pathlib import Path -import threading from typing import Any, Callable, Dict, Iterable, List, Mapping, Optional, Union -import ujson as json from tqdm.auto import tqdm from cmdstanpy import _CMDSTAN_REFRESH, _CMDSTAN_SAMPLING, _CMDSTAN_WARMUP @@ -1587,6 +1587,7 @@ def _run_cmdstan( env=os.environ, universal_newlines=True, ) + timer: Optional[threading.Timer] if timeout: def _timer_target() -> None: diff --git a/cmdstanpy/utils/__init__.py b/cmdstanpy/utils/__init__.py index 8b02f168..5b245bf8 100644 --- a/cmdstanpy/utils/__init__.py +++ b/cmdstanpy/utils/__init__.py @@ -85,7 +85,7 @@ def show_versions(output: bool = True) -> str: except Exception: deps_info.append(('cmdstan', 'NOT FOUND')) - deps = ['cmdstanpy', 'pandas', 'xarray', 'tqdm', 'numpy', 'ujson'] + deps = ['cmdstanpy', 'pandas', 'xarray', 'tqdm', 'numpy'] for module in deps: try: if module in sys.modules: diff --git a/cmdstanpy/utils/json.py b/cmdstanpy/utils/json.py index dcb6b604..07828ce5 100644 --- a/cmdstanpy/utils/json.py +++ b/cmdstanpy/utils/json.py @@ -2,30 +2,10 @@ Utilities for writing Stan Json files """ import json -import math from collections.abc import Collection -from typing import Any, List, Mapping, Union +from typing import Any, List, Mapping import numpy as np -import ujson - -from .logging import get_logger - - -def rewrite_inf_nan( - data: Union[float, int, List[Any]] -) -> Union[str, int, float, List[Any]]: - """Replaces NaN and Infinity with string representations""" - if isinstance(data, float): - if math.isnan(data): - return 'NaN' - if math.isinf(data): - return ('+' if data > 0 else '-') + 'inf' - return data - elif isinstance(data, list): - return [rewrite_inf_nan(item) for item in data] - else: - return data def serialize_complex(c: Any) -> List[float]: @@ -56,7 +36,6 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None: """ data_out = {} for key, val in data.items(): - handle_nan_inf = False if val is not None: if isinstance(val, (str, bytes)) or ( type(val).__module__ != 'numpy' @@ -67,9 +46,9 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None: + f"write_stan_json for key '{key}'" ) try: - handle_nan_inf = not np.all(np.isfinite(val)) - except TypeError: # handles cases like val == ['hello'] + np.isfinite(val) + except TypeError: # pylint: disable=raise-missing-from raise ValueError( "Invalid type provided to " @@ -86,12 +65,5 @@ def write_stan_json(path: str, data: Mapping[str, Any]) -> None: else: data_out[key] = val - if handle_nan_inf: - data_out[key] = rewrite_inf_nan(data_out[key]) - with open(path, 'w') as fd: - try: - ujson.dump(data_out, fd) - except TypeError as e: - get_logger().debug(e) - json.dump(data_out, fd, default=serialize_complex) + json.dump(data_out, fd, default=serialize_complex) diff --git a/cmdstanpy/utils/stancsv.py b/cmdstanpy/utils/stancsv.py index ace3f1cd..216c253d 100644 --- a/cmdstanpy/utils/stancsv.py +++ b/cmdstanpy/utils/stancsv.py @@ -1,6 +1,7 @@ """ Utility functions for reading the Stan CSV format """ +import json import math import re from enum import Enum, auto @@ -17,7 +18,6 @@ import numpy as np import pandas as pd -import ujson from cmdstanpy import _CMDSTAN_SAMPLING, _CMDSTAN_THIN, _CMDSTAN_WARMUP @@ -453,7 +453,7 @@ def read_metric(path: str) -> List[int]: """ if path.endswith('.json'): with open(path, 'r') as fd: - metric_dict = ujson.load(fd) + metric_dict = json.load(fd) if 'inv_metric' in metric_dict: dims_np: np.ndarray = np.asarray(metric_dict['inv_metric']) return list(dims_np.shape) diff --git a/docsrc/users-guide/examples/Run Generated Quantities.ipynb b/docsrc/users-guide/examples/Run Generated Quantities.ipynb index 87cbd793..403befa2 100644 --- a/docsrc/users-guide/examples/Run Generated Quantities.ipynb +++ b/docsrc/users-guide/examples/Run Generated Quantities.ipynb @@ -2,6 +2,7 @@ "cells": [ { "cell_type": "markdown", + "metadata": {}, "source": [ "# Generating new quantities of interest.\n", "\n", @@ -19,11 +20,11 @@ "- transform parameters for reporting\n", "- apply full Bayesian decision theory\n", "- calculate log likelihoods, deviances, etc. for model comparison" - ], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "## Example: add posterior predictive checks to `bernoulli.stan`\n", "\n", @@ -34,12 +35,13 @@ "We instantiate the model `bernoulli`,\n", "as in the \"Hello World\" section\n", "of the CmdStanPy [tutorial](https://github.com/stan-dev/cmdstanpy/blob/develop/cmdstanpy_tutorial.ipynb) notebook." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "import os\n", "from cmdstanpy import cmdstan_path, CmdStanModel, CmdStanMCMC, CmdStanGQ\n", @@ -51,153 +53,151 @@ "# instantiate, compile bernoulli model\n", "model = CmdStanModel(stan_file=stan_file)\n", "print(model.code())" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The input data consists of `N` - the number of bernoulli trials and `y` - the list of observed outcomes.\n", "Inspection of the data shows that on average, there is a 20% chance of success for any given Bernoulli trial." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# examine bernoulli data\n", - "import ujson\n", + "import json\n", "import statistics\n", "with open(data_file,'r') as fp:\n", - " data_dict = ujson.load(fp)\n", + " data_dict = json.load(fp)\n", "print(data_dict)\n", "print('mean of y: {}'.format(statistics.mean(data_dict['y'])))" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "As in the \"Hello World\" tutorial, we produce a sample from the posterior of the model conditioned on the data:" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "# fit the model to the data\n", "fit = model.sample(data=data_file)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The fitted model produces an estimate of `theta` - the chance of success" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "fit.summary()" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "To run a prior predictive check, we add a `generated quantities` block to the model, in which we generate a new data vector `y_rep` using the current estimate of theta. The resulting model is in file [bernoulli_ppc.stan](https://github.com/stan-dev/cmdstanpy/blob/master/test/data/bernoulli_ppc.stan)" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "model_ppc = CmdStanModel(stan_file='bernoulli_ppc.stan')\n", "print(model_ppc.code())" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We run the `generate_quantities` method on `bernoulli_ppc` using existing sample `fit` as input. The `generate_quantities` method takes the values of `theta` in the `fit` sample as the set of draws from the posterior used to generate the corresponsing `y_rep` quantities of interest.\n", "\n", "The arguments to the `generate_quantities` method are:\n", " + `data` - the data used to fit the model\n", " + `mcmc_sample` - either a `CmdStanMCMC` object or a list of stan-csv files\n" - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "new_quantities = model_ppc.generate_quantities(data=data_file, mcmc_sample=fit)" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "The `generate_quantities` method returns a `CmdStanGQ` object which contains the values for all variables in the generated quantitites block of the program ``bernoulli_ppc.stan``. Unlike the output from the ``sample`` method, it doesn't contain any information on the joint log probability density, sampler state, or parameters or transformed parameter values.\n", "\n", "In this example, each draw consists of the N-length array of replicate of the `bernoulli` model's input variable `y`, which is an N-length array of Bernoulli outcomes." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "print(new_quantities.draws().shape, new_quantities.column_names)\n", "for i in range(3):\n", " print (new_quantities.draws()[i,:])" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "We can also use ``draws_pd(inc_sample=True)`` to get a pandas DataFrame which combines the input drawset with the generated quantities." - ], - "metadata": {} + ] }, { "cell_type": "code", "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ "sample_plus = new_quantities.draws_pd(inc_sample=True)\n", "print(type(sample_plus),sample_plus.shape)\n", "names = list(sample_plus.columns.values[7:18])\n", "sample_plus.iloc[0:3, :]" - ], - "outputs": [], - "metadata": {} + ] }, { "cell_type": "markdown", + "metadata": {}, "source": [ "For models as simple as the bernoulli models here, it would be trivial to re-run the sampler and generate a new sample which contains both the estimate of the parameters `theta` as well as `y_rep` values. For models which are difficult to fit, i.e., when producing a sample is computationally expensive, the `generate_quantities` method is preferred." - ], - "metadata": {} + ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3.9.5 ('stan')", "language": "python", "name": "python3" }, @@ -212,8 +212,13 @@ "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.5" + }, + "vscode": { + "interpreter": { + "hash": "8765ce46b013071999fc1966b52035a7309a0da7551e066cc0f0fa23e83d4f60" + } } }, "nbformat": 4, "nbformat_minor": 4 -} \ No newline at end of file +} diff --git a/requirements-test.txt b/requirements-test.txt index de7b489b..2506c7c1 100644 --- a/requirements-test.txt +++ b/requirements-test.txt @@ -7,4 +7,3 @@ mypy testfixtures tqdm xarray -types-ujson diff --git a/requirements.txt b/requirements.txt index 3bab93f2..3e3bfd45 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,4 +1,3 @@ pandas numpy>=1.21 -ujson tqdm diff --git a/test/data/data-test.stan b/test/data/data-test.stan new file mode 100644 index 00000000..bcdc3efd --- /dev/null +++ b/test/data/data-test.stan @@ -0,0 +1,12 @@ +data { + real inf; + real nan; +} + + +generated quantities { + print(inf); + print(nan); + real inf_out = inf; + real nan_out = nan; +} diff --git a/test/test_sample.py b/test/test_sample.py index 03246aa2..ff9eded3 100644 --- a/test/test_sample.py +++ b/test/test_sample.py @@ -2,6 +2,7 @@ import contextlib import io +import json import logging import os import platform @@ -16,11 +17,6 @@ import numpy as np from testfixtures import LogCapture, StringComparison -try: - import ujson as json -except ImportError: - import json - import cmdstanpy.stanfit from cmdstanpy import _TMPDIR from cmdstanpy.cmdstan_args import CmdStanArgs, Method, SamplerArgs @@ -1919,6 +1915,19 @@ def test_timeout(self): with self.assertRaises(TimeoutError): timeout_model.sample(timeout=0.1, chains=1, data={'loop': 1}) + def test_json_edges(self): + stan = os.path.join(DATAFILES_PATH, 'data-test.stan') + data_model = CmdStanModel(stan_file=stan) + data = {"inf": float("inf"), "nan": float("NaN")} + fit = data_model.sample(data, chains=1, iter_warmup=1, iter_sampling=1) + self.assertTrue(np.isnan(fit.stan_variable("nan_out")[0])) + self.assertTrue(np.isinf(fit.stan_variable("inf_out")[0])) + + data = {"inf": np.inf, "nan": np.nan} + fit = data_model.sample(data, chains=1, iter_warmup=1, iter_sampling=1) + self.assertTrue(np.isnan(fit.stan_variable("nan_out")[0])) + self.assertTrue(np.isinf(fit.stan_variable("inf_out")[0])) + if __name__ == '__main__': unittest.main() diff --git a/test/test_utils.py b/test/test_utils.py index ef6efb13..c276fa94 100644 --- a/test/test_utils.py +++ b/test/test_utils.py @@ -252,8 +252,8 @@ def cmp(d1, d2): data_2 = d2[k] if isinstance(data_2, collections.abc.Collection): data_2 = np.asarray(data_2).tolist() - - self.assertEqual(data_1, data_2) + # np properly handles NaN equality + np.testing.assert_equal(data_1, data_2) dict_list = {'a': [1.0, 2.0, 3.0]} file_list = os.path.join(_TMPDIR, 'list.json') @@ -345,11 +345,14 @@ def cmp(d1, d2): ] ) } - dict_inf_nan_exp = {'a': [["-inf", "+inf", "NaN"]] * 4} + dict_inf_nan_exp = {'a': [[-np.inf, np.inf, np.nan]] * 4} file_fin = os.path.join(_TMPDIR, 'inf.json') write_stan_json(file_fin, dict_inf_nan) with open(file_fin) as fd: - cmp(json.load(fd), dict_inf_nan_exp) + cmp( + json.load(fd), + dict_inf_nan_exp, + ) dict_complex = {'a': np.array([np.complex64(3), 3 + 4j])} dict_complex_exp = {'a': [[3, 0], [3, 4]]}