Skip to content

Commit 8794ffa

Browse files
DanielYang59 and shyuep authored
Replace pybtex with bibtexparser (#4361)
* partial migrate
* change dep
* recover TODO
* recover some comments
* remove pybtex check
* avoid import pybtex for now
* (need double check) migrate is_valid_bibtex
* lower version of bibtexparser seems to throw error for test_provenance
* uv pip compile pyproject.toml -o requirements.txt
* mark implicit text mode as error
* migrate cif bibtex str getter

---------

Co-authored-by: Shyue Ping Ong <[email protected]>
1 parent 589b6b2 commit 8794ffa

File tree

6 files changed

+96
-110
lines changed

6 files changed

+96
-110
lines changed

pyproject.toml

+2-1
Original file line number | Diff line number | Diff line change
@@ -61,7 +61,7 @@ dependencies = [
6161
"palettable>=3.3.3",
6262
"pandas>=2",
6363
"plotly>=5.0.0",
64-
"pybtex>=0.24.0",
64+
"bibtexparser>=1.4.0",
6565
"requests>=2.32",
6666
"ruamel.yaml>=0.17.0",
6767
"scipy>=1.13.0",
@@ -260,6 +260,7 @@ filterwarnings = [
260260
# NOTE: the LAST matching option would be used
261261
"ignore::UserWarning", # Ignore UserWarning
262262
"error:We strongly encourage explicit `encoding`:EncodingWarning", # Mark `zopen` EncodingWarning as error
263+
"error:We strongly discourage using implicit binary/text:FutureWarning", # Mark `zopen` FutureWarning as error
263264
# TODO: remove the following filter once `monty.io` dropped custom EncodingWarning
264265
"error:We strongly encourage explicit `encoding`:monty.io.EncodingWarning",
265266
# TODO: pybtex (perhaps some others) emits the following warnings

requirements.txt

+51-52
Original file line number | Diff line number | Diff line change
@@ -1,88 +1,87 @@
11
# This file was autogenerated by uv via the following command:
22
# uv pip compile pyproject.toml -o requirements.txt
3+
bibtexparser==1.4.3
4+
# via pymatgen (pyproject.toml)
35
certifi==2024.8.30
4-
# via requests
6+
# via requests
57
charset-normalizer==3.4.0
6-
# via requests
8+
# via requests
79
contourpy==1.3.1
8-
# via matplotlib
10+
# via matplotlib
911
cycler==0.12.1
10-
# via matplotlib
12+
# via matplotlib
1113
fonttools==4.55.0
12-
# via matplotlib
14+
# via matplotlib
1315
idna==3.10
14-
# via requests
16+
# via requests
1517
joblib==1.4.2
16-
# via pymatgen (pyproject.toml)
18+
# via pymatgen (pyproject.toml)
1719
kiwisolver==1.4.7
18-
# via matplotlib
19-
latexcodec==3.0.0
20-
# via pybtex
20+
# via matplotlib
2121
matplotlib==3.9.2
22-
# via pymatgen (pyproject.toml)
22+
# via pymatgen (pyproject.toml)
2323
monty==2025.1.9
24-
# via pymatgen (pyproject.toml)
24+
# via pymatgen (pyproject.toml)
2525
mpmath==1.3.0
26-
# via sympy
26+
# via sympy
2727
networkx==3.4.2
28-
# via pymatgen (pyproject.toml)
28+
# via pymatgen (pyproject.toml)
2929
numpy==2.0.0
30-
# via
31-
# pymatgen (pyproject.toml)
32-
# contourpy
33-
# matplotlib
34-
# pandas
35-
# scipy
36-
# spglib
30+
# via
31+
# pymatgen (pyproject.toml)
32+
# contourpy
33+
# matplotlib
34+
# monty
35+
# pandas
36+
# scipy
37+
# spglib
3738
packaging==24.2
38-
# via
39-
# matplotlib
40-
# plotly
39+
# via
40+
# matplotlib
41+
# plotly
4142
palettable==3.3.3
42-
# via pymatgen (pyproject.toml)
43+
# via pymatgen (pyproject.toml)
4344
pandas==2.2.3
44-
# via pymatgen (pyproject.toml)
45+
# via pymatgen (pyproject.toml)
4546
pillow==11.0.0
46-
# via matplotlib
47+
# via matplotlib
4748
plotly==5.24.1
48-
# via pymatgen (pyproject.toml)
49-
pybtex==0.24.0
50-
# via pymatgen (pyproject.toml)
49+
# via pymatgen (pyproject.toml)
5150
pyparsing==3.2.0
52-
# via matplotlib
51+
# via
52+
# bibtexparser
53+
# matplotlib
5354
python-dateutil==2.9.0.post0
54-
# via
55-
# matplotlib
56-
# pandas
55+
# via
56+
# matplotlib
57+
# pandas
5758
pytz==2024.2
58-
# via pandas
59-
pyyaml==6.0.2
60-
# via pybtex
59+
# via pandas
6160
requests==2.32.3
62-
# via pymatgen (pyproject.toml)
61+
# via pymatgen (pyproject.toml)
6362
ruamel-yaml==0.18.10
64-
# via pymatgen (pyproject.toml)
63+
# via
64+
# pymatgen (pyproject.toml)
65+
# monty
6566
ruamel-yaml-clib==0.2.12
66-
# via ruamel-yaml
67+
# via ruamel-yaml
6768
scipy==1.14.1
68-
# via pymatgen (pyproject.toml)
69+
# via pymatgen (pyproject.toml)
6970
six==1.16.0
70-
# via
71-
# pybtex
72-
# python-dateutil
71+
# via python-dateutil
7372
spglib==2.5.0
74-
# via pymatgen (pyproject.toml)
73+
# via pymatgen (pyproject.toml)
7574
sympy==1.13.1
76-
# via pymatgen (pyproject.toml)
75+
# via pymatgen (pyproject.toml)
7776
tabulate==0.9.0
78-
# via pymatgen (pyproject.toml)
77+
# via pymatgen (pyproject.toml)
7978
tenacity==9.0.0
80-
# via plotly
79+
# via plotly
8180
tqdm==4.67.1
82-
# via pymatgen (pyproject.toml)
81+
# via pymatgen (pyproject.toml)
8382
tzdata==2024.2
84-
# via pandas
83+
# via pandas
8584
uncertainties==3.2.2
86-
# via pymatgen (pyproject.toml)
85+
# via pymatgen (pyproject.toml)
8786
urllib3==2.2.3
88-
# via requests
87+
# via requests

src/pymatgen/io/cif.py

+29-29
Original file line number | Diff line number | Diff line change
@@ -1349,16 +1349,15 @@ def get_structures(self, *args, **kwargs) -> list[Structure]:
13491349
def get_bibtex_string(self) -> str:
13501350
"""Get BibTeX reference from CIF file.
13511351
1352-
Args:
1353-
data:
1352+
TODO:
1353+
- parse '_publ_section_references' when it exists?
1354+
- CIF specification supports multiple citations.
13541355
13551356
Returns:
13561357
BibTeX string.
13571358
"""
1358-
try:
1359-
from pybtex.database import BibliographyData, Entry
1360-
except ImportError:
1361-
raise RuntimeError("Bibliographic data extraction requires pybtex.")
1359+
from bibtexparser.bibdatabase import BibDatabase
1360+
from bibtexparser.bwriter import BibTexWriter
13621361

13631362
bibtex_keys: dict[str, tuple[str, ...]] = {
13641363
"author": ("_publ_author_name", "_citation_author_name"),
@@ -1377,44 +1376,45 @@ def get_bibtex_string(self) -> str:
13771376
"doi": ("_journal_DOI", "_citation_DOI"),
13781377
}
13791378

1380-
entries: dict[str, Entry] = {}
1381-
1382-
# TODO: parse '_publ_section_references' when it exists?
1383-
# TODO: CIF specification supports multiple citations.
1379+
db = BibDatabase()
1380+
db.entries = []
13841381

13851382
for idx, data in enumerate(self._cif.data.values()):
13861383
# Convert to lower-case keys, some CIF files inconsistent
13871384
_data = {k.lower(): v for k, v in data.data.items()}
1388-
1389-
bibtex_entry = {}
1385+
entry = {"ENTRYTYPE": "article", "ID": f"cifref{idx}"}
13901386

13911387
for field, tags in bibtex_keys.items():
13921388
for tag in tags:
13931389
if tag in _data:
1394-
if isinstance(_data[tag], list):
1395-
bibtex_entry[field] = _data[tag][0]
1396-
else:
1397-
bibtex_entry[field] = _data[tag]
1390+
value = _data[tag]
1391+
entry[field] = value[0] if isinstance(value, list) else value
1392+
break
13981393

13991394
# Convert to bibtex author format ("and" delimited)
1400-
if "author" in bibtex_entry:
1395+
if "author" in entry:
14011396
# Separate out semicolon authors
1402-
if isinstance(bibtex_entry["author"], str) and ";" in bibtex_entry["author"]:
1403-
bibtex_entry["author"] = bibtex_entry["author"].split(";")
1404-
1405-
if isinstance(bibtex_entry["author"], list):
1406-
bibtex_entry["author"] = " and ".join(bibtex_entry["author"])
1397+
if isinstance(entry["author"], str) and ";" in entry["author"]:
1398+
entry["author"] = entry["author"].split(";")
1399+
if isinstance(entry["author"], list):
1400+
entry["author"] = " and ".join(entry["author"])
14071401

14081402
# Convert to bibtex page range format, use empty string if not specified
1409-
if ("page_first" in bibtex_entry) or ("page_last" in bibtex_entry):
1410-
bibtex_entry["pages"] = bibtex_entry.get("page_first", "") + "--" + bibtex_entry.get("page_last", "")
1411-
bibtex_entry.pop("page_first", None) # and remove page_first, page_list if present
1412-
bibtex_entry.pop("page_last", None)
1403+
if "page_first" in entry or "page_last" in entry:
1404+
entry["pages"] = f"{entry.get('page_first', '')}--{entry.get('page_last', '')}"
1405+
entry.pop("page_first", None) # and remove page_first, page_list if present
1406+
entry.pop("page_last", None)
1407+
1408+
db.entries.append(entry)
14131409

1414-
# Cite keys are given as cif-reference-idx in order they are found
1415-
entries[f"cifref{idx}"] = Entry("article", list(bibtex_entry.items()))
1410+
# NOTE: the following is added to make output consistent with
1411+
# previous pybtex implementation
1412+
writer = BibTexWriter()
1413+
writer.indent = " "
1414+
writer.display_order = ("author", "title", "journal", "volume", "year", "pages")
14161415

1417-
return BibliographyData(entries).to_string(bib_format="bibtex")
1416+
# Replace curly brackets with double quotes (skip the first and last one)
1417+
return re.sub(r"(^\s*\w+\s*=\s*)\{([^{}]*)\}", r'\1"\2"', writer.write(db), flags=re.MULTILINE)
14181418

14191419
def as_dict(self) -> dict:
14201420
"""MSONable dict."""

src/pymatgen/util/provenance.py

+12-18
Original file line number | Diff line number | Diff line change
@@ -6,19 +6,12 @@
66
import re
77
import sys
88
from datetime import datetime, timezone
9-
from io import StringIO
109
from typing import TYPE_CHECKING, NamedTuple
1110

1211
from monty.json import MontyDecoder, MontyEncoder
1312

1413
from pymatgen.core.structure import Molecule, Structure
1514

16-
try:
17-
from pybtex import errors
18-
from pybtex.database.input import bibtex
19-
except ImportError:
20-
pybtex = bibtex = errors = None
21-
2215
if TYPE_CHECKING:
2316
from collections.abc import Sequence
2417
from typing import Any
@@ -37,21 +30,22 @@
3730

3831

3932
def is_valid_bibtex(reference: str) -> bool:
40-
"""Use pybtex to validate that a reference is in proper BibTeX format.
33+
"""Validate that a reference is in proper BibTeX format.
4134
4235
Args:
4336
reference (str): Reference in BibTeX format.
4437
4538
Returns:
4639
bool: True if reference is valid BibTeX.
4740
"""
48-
# str is necessary since pybtex seems to have an issue with unicode.
49-
# The filter expression removes all non-ASCII characters.
50-
str_io = StringIO(reference.encode("ascii", "ignore").decode("ascii"))
51-
parser = bibtex.Parser()
52-
errors.set_strict_mode(enable=False)
53-
bib_data = parser.parse_stream(str_io)
54-
return len(bib_data.entries) > 0
41+
from bibtexparser.bparser import BibTexParser
42+
43+
parser = BibTexParser()
44+
try:
45+
bib_database = parser.parse(reference)
46+
return bool(bib_database.entries)
47+
except Exception:
48+
return False
5549

5650

5751
class HistoryNode(NamedTuple):
@@ -213,14 +207,14 @@ def __init__(
213207
self.structure = struct_or_mol
214208

215209
# Turn `authors` into list of `Author` objects
216-
authors = authors.split(",") if isinstance(authors, str) else authors
217-
self.authors = [Author.parse_author(a) for a in authors]
210+
_authors: list = authors.split(",") if isinstance(authors, str) else authors
211+
self.authors: list[Author] = [Author.parse_author(a) for a in _authors]
218212

219213
# Turn `projects` into list of strings
220214
projects = projects or []
221215
self.projects: list[str] = [projects] if isinstance(projects, str) else projects
222216

223-
# Check that references are valid BibTeX
217+
# Check that references are valid BibTeX string
224218
if not isinstance(references, str):
225219
raise TypeError("Invalid format for SNL reference! Should be empty string or BibTeX string.")
226220

tests/io/test_cif.py

+1-8
Original file line number | Diff line number | Diff line change
@@ -12,12 +12,6 @@
1212
from pymatgen.symmetry.structure import SymmetrizedStructure
1313
from pymatgen.util.testing import TEST_FILES_DIR, VASP_IN_DIR, MatSciTest
1414

15-
try:
16-
import pybtex
17-
except ImportError:
18-
pybtex = None
19-
20-
2115
MCIF_TEST_DIR = f"{TEST_FILES_DIR}/io/cif/mcif"
2216

2317

@@ -926,7 +920,7 @@ def test_cif_writer_write_file(self):
926920

927921
# test write_file append mode='a'
928922
struct2 = Structure.from_file(f"{TEST_FILES_DIR}/cif/Graphite.cif")
929-
CifWriter(struct2).write_file(out_path, mode="a")
923+
CifWriter(struct2).write_file(out_path, mode="at")
930924

931925
read_structs = CifParser(out_path).parse_structures()
932926
assert len(read_structs) == 2
@@ -1139,7 +1133,6 @@ def test_write(self):
11391133
cw = CifWriter(s_manual, write_magmoms=True)
11401134
assert str(cw) == cw_manual_oxi_string
11411135

1142-
@pytest.mark.skipif(pybtex is None, reason="pybtex not present")
11431136
def test_bibtex(self):
11441137
ref_bibtex_string = """@article{cifref0,
11451138
author = "Blanco, J.A.",

tests/util/test_provenance.py

+1-2
Original file line number | Diff line number | Diff line change
@@ -45,8 +45,7 @@ def setup_method(self):
4545
".com/retrieve/pii/S0927025612006295},\n volume = {68},"
4646
"\n year = {2013}\n}"
4747
)
48-
repeat = "REPEAT" * 10000
49-
self.superlong = f"@misc{{SuperLong,\ntitle = {{{repeat}}}}}"
48+
self.superlong = f"@misc{{SuperLong,\ntitle = {{{'REPEAT' * 10000}}}}}"
5049
self.unicode_title = "@misc{Unicode_Title,\ntitle = {{A \u73ab is a rose}}}"
5150
self.junk = "This is junk text, not a BibTeX reference"
5251

0 commit comments

Comments
 (0)