Skip to content

Commit 1aec1c7

Browse files
authored
Auto-detect bibfile encoding (#1664)
Fixes #335. This commit vendors the `charset-normalizer` package and uses it to detect the encoding of loaded bib files, extending support beyond plain UTF-8.
1 parent d0b1374 commit 1aec1c7

File tree

12 files changed

+4757
-95
lines changed

12 files changed

+4757
-95
lines changed

plugins/bibliography/new_bibliography.py

Lines changed: 54 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import sublime
44
import traceback
55

6+
from ...vendor.charset_normalizer import from_bytes as charset_from_bytes
7+
68
from ...latextools.latextools_plugin import LaTeXToolsPlugin
79
from ...latextools.utils import bibcache
810
from ...latextools.utils.logging import logger
@@ -122,46 +124,62 @@ def get_entries(self, *bib_files):
122124
pass
123125

124126
try:
125-
with open(bibfname, "r", encoding="utf-8", errors="ignore", newline="\n") as bibf:
126-
bib_entries = []
127-
128-
excluded_types = ("xdata", "comment", "string")
129-
excluded_fields = (
130-
"abstract",
131-
"annotation",
132-
"annote",
133-
"execute",
134-
"langidopts",
135-
"options",
136-
)
137-
138-
for key, entry in parser.parse(bibf.read()).items():
139-
if entry.entry_type in excluded_types:
140-
continue
141-
142-
# purge some unnecessary fields from the bib entry to save
143-
# some space and time reloading
144-
for k in excluded_fields:
145-
if k in entry:
146-
del entry[k]
147-
148-
bib_entries.append(EntryWrapper(entry))
149-
150-
logger.info(f"Loaded {len(bib_entries)} bibitems")
151-
152-
try:
153-
fmt_entries = bib_cache.set(bib_entries)
154-
entries.extend(fmt_entries)
155-
except Exception:
156-
traceback.print_exc()
157-
logger.warning("Using bibliography without caching it")
158-
entries.extend(bib_entries)
159-
127+
with open(bibfname, "rb") as bibf:
128+
content = bibf.read()
160129
except OSError:
161130
msg = f'Cannot open bibliography file "{bibfname}"!'
162131
logger.error(msg)
163132
sublime.status_message(msg)
164-
continue
133+
else:
134+
bib_entries = []
135+
136+
excluded_types = ("xdata", "comment", "string")
137+
excluded_fields = (
138+
"abstract",
139+
"annotation",
140+
"annote",
141+
"execute",
142+
"langidopts",
143+
"options",
144+
)
145+
146+
# detect encoding
147+
charset_match = charset_from_bytes(content).best()
148+
if not charset_match:
149+
msg = f'Cannot determine encoding of file "{bibfname}"!'
150+
logger.error(msg)
151+
sublime.status_message(msg)
152+
continue
153+
encoding = charset_match.encoding
154+
if charset_match.bom and encoding == "utf_8":
155+
content = content[len(codecs.BOM_UTF8):]
156+
157+
# decode bytes
158+
text = content.decode(encoding=encoding)
159+
text = text.replace("\r\n", "\n").replace("\r", "\n")
160+
161+
# parse text
162+
for key, entry in parser.parse(text).items():
163+
if entry.entry_type in excluded_types:
164+
continue
165+
166+
# purge some unnecessary fields from the bib entry to save
167+
# some space and time reloading
168+
for k in excluded_fields:
169+
if k in entry:
170+
del entry[k]
171+
172+
bib_entries.append(EntryWrapper(entry))
173+
174+
logger.info(f"Loaded {len(bib_entries)} bibitems")
175+
176+
try:
177+
fmt_entries = bib_cache.set(bib_entries)
178+
entries.extend(fmt_entries)
179+
except Exception:
180+
traceback.print_exc()
181+
logger.warning("Using bibliography without caching it")
182+
entries.extend(bib_entries)
165183

166184
logger.info(f"Found {len(entries)} total bib entries")
167185

plugins/bibliography/traditional_bibliography.py

Lines changed: 79 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
import sublime
44
import traceback
55

6+
from ...vendor.charset_normalizer import from_bytes as charset_from_bytes
7+
68
from ...latextools.latextools_plugin import LaTeXToolsPlugin
79
from ...latextools.utils import bibcache
810
from ...latextools.utils.logging import logger
@@ -53,69 +55,87 @@ def get_entries(self, *bib_files):
5355
pass
5456

5557
try:
56-
with open(bibfname, "r", encoding="utf-8", errors="ignore", newline="\n") as bibf:
57-
bib_entries = []
58-
entry = {}
59-
for line in bibf.readlines():
60-
line = line.strip()
61-
# Let's get rid of irrelevant lines first
62-
if line == "" or line[0] == "%":
63-
continue
64-
if line.lower()[0:8] == "@comment":
65-
continue
66-
if line.lower()[0:7] == "@string":
67-
continue
68-
if line.lower()[0:9] == "@preamble":
69-
continue
70-
if line[0] == "@":
71-
if "keyword" in entry:
72-
bib_entries.append(entry)
73-
entry = {}
74-
75-
kp_match = kp.search(line)
76-
if kp_match:
77-
entry["keyword"] = kp_match.group(1)
78-
else:
79-
logger.error(f"Cannot process this @ line: {line}")
80-
logger.error(
81-
"Previous keyword (if any): " + entry.get("keyword", ""),
82-
)
83-
continue
84-
85-
# Now test for title, author, etc.
86-
# Note: we capture only the first line, but that's OK for our purposes
87-
multip_match = multip.search(line)
88-
if multip_match:
89-
key = multip_match.group(1).lower()
90-
value = codecs.decode(multip_match.group(2), "latex")
91-
92-
if key == "title":
93-
value = (
94-
value.replace("{\\textquoteright}", "")
95-
.replace("{", "")
96-
.replace("}", "")
97-
)
98-
entry[key] = value
99-
100-
# at the end, we have a single record
101-
if "keyword" in entry:
102-
bib_entries.append(entry)
103-
104-
logger.info(f"Loaded {len(bib_entries)} bibitems")
105-
106-
try:
107-
fmt_entries = bib_cache.set(bib_entries)
108-
entries.extend(fmt_entries)
109-
except Exception:
110-
traceback.print_exc()
111-
logger.warning("Using bibliography without caching it")
112-
entries.extend(bib_entries)
113-
58+
with open(bibfname, "rb") as bibf:
59+
content = bibf.read()
11460
except OSError:
11561
msg = f'Cannot open bibliography file "{bibfname}"!'
11662
logger.error(msg)
11763
sublime.status_message(msg)
118-
continue
64+
else:
65+
bib_entries = []
66+
entry = {}
67+
68+
# detect encoding
69+
charset_match = charset_from_bytes(content).best()
70+
if not charset_match:
71+
msg = f'Cannot determine encoding of file "{bibfname}"!'
72+
logger.error(msg)
73+
sublime.status_message(msg)
74+
continue
75+
encoding = charset_match.encoding
76+
if charset_match.bom and encoding == "utf_8":
77+
content = content[len(codecs.BOM_UTF8):]
78+
79+
# decode bytes
80+
text = content.decode(encoding=encoding)
81+
text = text.replace("\r\n", "\n").replace("\r", "\n")
82+
83+
# parse text
84+
for line in text.splitlines():
85+
line = line.strip()
86+
# Let's get rid of irrelevant lines first
87+
if line == "" or line[0] == "%":
88+
continue
89+
if line.lower()[0:8] == "@comment":
90+
continue
91+
if line.lower()[0:7] == "@string":
92+
continue
93+
if line.lower()[0:9] == "@preamble":
94+
continue
95+
if line[0] == "@":
96+
if "keyword" in entry:
97+
bib_entries.append(entry)
98+
entry = {}
99+
100+
kp_match = kp.search(line)
101+
if kp_match:
102+
entry["keyword"] = kp_match.group(1)
103+
else:
104+
logger.error(f"Cannot process this @ line: {line}")
105+
logger.error(
106+
"Previous keyword (if any): " + entry.get("keyword", ""),
107+
)
108+
continue
109+
110+
# Now test for title, author, etc.
111+
# Note: we capture only the first line, but that's OK for our purposes
112+
multip_match = multip.search(line)
113+
if multip_match:
114+
key = multip_match.group(1).lower()
115+
value = codecs.decode(multip_match.group(2), "latex")
116+
117+
if key == "title":
118+
value = (
119+
value.replace("{\\textquoteright}", "")
120+
.replace("{", "")
121+
.replace("}", "")
122+
)
123+
entry[key] = value
124+
125+
# at the end, we have a single record
126+
if "keyword" in entry:
127+
bib_entries.append(entry)
128+
129+
logger.info(f"Loaded {len(bib_entries)} bibitems")
130+
131+
try:
132+
fmt_entries = bib_cache.set(bib_entries)
133+
entries.extend(fmt_entries)
134+
except Exception:
135+
traceback.print_exc()
136+
logger.warning("Using bibliography without caching it")
137+
entries.extend(bib_entries)
138+
119139

120140
logger.info(f"Found {len(entries)} total bib entries")
121141

Lines changed: 48 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
"""
2+
Charset-Normalizer
3+
~~~~~~~~~~~~~~
4+
The Real First Universal Charset Detector.
5+
A library that helps you read text from an unknown charset encoding.
6+
Motivated by chardet, This package is trying to resolve the issue by taking a new approach.
7+
All IANA character set names for which the Python core library provides codecs are supported.
8+
9+
Basic usage:
10+
>>> from charset_normalizer import from_bytes
11+
>>> results = from_bytes('Bсеки човек има право на образование. Oбразованието!'.encode('utf_8'))
12+
>>> best_guess = results.best()
13+
>>> str(best_guess)
14+
'Bсеки човек има право на образование. Oбразованието!'
15+
16+
Others methods and usages are available - see the full documentation
17+
at <https://github.com/Ousret/charset_normalizer>.
18+
:copyright: (c) 2021 by Ahmed TAHRI
19+
:license: MIT, see LICENSE for more details.
20+
"""
21+
22+
from __future__ import annotations
23+
24+
import logging
25+
26+
from .api import from_bytes, from_fp, from_path, is_binary
27+
from .legacy import detect
28+
from .models import CharsetMatch, CharsetMatches
29+
from .utils import set_logging_handler
30+
from .version import VERSION, __version__
31+
32+
__all__ = (
33+
"from_fp",
34+
"from_path",
35+
"from_bytes",
36+
"is_binary",
37+
"detect",
38+
"CharsetMatch",
39+
"CharsetMatches",
40+
"__version__",
41+
"VERSION",
42+
"set_logging_handler",
43+
)
44+
45+
# Attach a NullHandler to the top level logger by default
46+
# https://docs.python.org/3.3/howto/logging.html#configuring-logging-for-a-library
47+
48+
logging.getLogger("charset_normalizer").addHandler(logging.NullHandler())

0 commit comments

Comments
 (0)