Skip to content

Commit cc25975

Browse files
schlenkMichael Schlenkerjkowalleck
authored
Handle misencoded license text files graceful. (#884)
--------- Signed-off-by: Michael Schlenker <michael.schlenker@contact-software.com> Signed-off-by: Jan Kowalleck <jan.kowalleck@gmail.com> Co-authored-by: Michael Schlenker <michael.schlenker@contact-software.com> Co-authored-by: Jan Kowalleck <jan.kowalleck@gmail.com>
1 parent 9861a46 commit cc25975

File tree

62 files changed

+1241
-45
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

62 files changed

+1241
-45
lines changed

cyclonedx_py/_internal/utils/bytes.py

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,28 @@
1+
# This file is part of CycloneDX Python
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software
10+
# distributed under the License is distributed on an "AS IS" BASIS,
11+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12+
# See the License for the specific language governing permissions and
13+
# limitations under the License.
14+
#
15+
# SPDX-License-Identifier: Apache-2.0
16+
# Copyright (c) OWASP Foundation. All Rights Reserved.
17+
18+
from sys import getdefaultencoding
19+
20+
from chardet import detect as chardetect
21+
22+
23+
def bytes2str(data: bytes, *, errors: str = 'strict') -> str:
    """Decode ``data`` to text using a detected encoding.

    The encoding is taken from chardet's detection result. When no encoding
    is detected, the interpreter's default encoding is used instead.

    :param data: the raw bytes to decode.
    :param errors: error handler passed on to ``bytes.decode``.
    :return: the decoded text.
    :raises UnicodeDecodeError: if the bytes cannot be decoded and the
        ``errors`` handler is ``'strict'``.
    """
    fallback = getdefaultencoding()
    # see https://docs.python.org/3/library/codecs.html#standard-encodings
    encoding = (chardetect(data)['encoding'] or fallback).replace(
        # replace Windows-encoding with code-page
        'Windows-', 'cp')
    try:
        return data.decode(encoding, errors)
    except LookupError:
        # The detector may report an encoding Python ships no codec for.
        # Retry with the default encoding, so that callers only ever have to
        # deal with `UnicodeDecodeError`, not with an unknown-codec error.
        return data.decode(fallback, errors)

cyclonedx_py/_internal/utils/io.py

Lines changed: 2 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -15,20 +15,14 @@
1515
# SPDX-License-Identifier: Apache-2.0
1616
# Copyright (c) OWASP Foundation. All Rights Reserved.
1717

18-
from sys import getdefaultencoding
1918
from tempfile import NamedTemporaryFile
2019
from typing import BinaryIO
2120

22-
from chardet import detect as chardetect
21+
from .bytes import bytes2str
2322

2423

2524
def io2str(io: BinaryIO, *, errors: str = 'strict') -> str:
26-
data = io.read()
27-
# see https://docs.python.org/3/library/codecs.html#standard-encodings
28-
encoding = (chardetect(data)['encoding'] or getdefaultencoding()).replace(
29-
# replace Windows-encoding with code-page
30-
'Windows-', 'cp')
31-
return data.decode(encoding, errors)
25+
return bytes2str(io.read(), errors=errors)
3226

3327

3428
def io2file(io: BinaryIO, *, errors: str = 'strict') -> str:

cyclonedx_py/_internal/utils/pep639.py

Lines changed: 19 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -30,6 +30,7 @@
3030
from cyclonedx.model import AttachedText, Encoding
3131
from cyclonedx.model.license import DisjunctiveLicense, LicenseAcknowledgement
3232

33+
from .bytes import bytes2str
3334
from .mimetypes import guess_type
3435

3536
if TYPE_CHECKING: # pragma: no cover
@@ -38,6 +39,10 @@
3839

3940
from cyclonedx.model.license import License
4041

42+
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
43+
# but in practice, other locations are used, too.
44+
_LICENSE_LOCATIONS = ('licenses', 'license_files', '')
45+
4146

4247
def dist2licenses(
4348
dist: 'Distribution',
@@ -55,12 +60,20 @@ def dist2licenses(
5560
for mlfile in set(metadata.get_all('License-File', ())):
5661
# see spec: https://peps.python.org/pep-0639/#add-license-file-field
5762
# latest spec rev: https://discuss.python.org/t/pep-639-round-3-improving-license-clarity-with-better-package-metadata/53020 # noqa: E501
58-
59-
# per spec > license files are stored in the `.dist-info/licenses/` subdirectory of the produced wheel.
60-
# but in practice, other locations are used, too.
61-
content = dist.read_text(join('licenses', mlfile)) \
62-
or dist.read_text(join('license_files', mlfile)) \
63-
or dist.read_text(mlfile)
63+
content = None
64+
for mlpath in _LICENSE_LOCATIONS:
65+
try:
66+
content = dist.read_text(join(mlpath, mlfile))
67+
except UnicodeDecodeError as err:
68+
try:
69+
content = bytes2str(err.object)
70+
except UnicodeDecodeError:
71+
pass
72+
else:
73+
break # for-loop
74+
else:
75+
if content is not None:
76+
break # for-loop
6477
if content is None: # pragma: no cover
6578
logger.debug('Error: failed to read license file %r for dist %r',
6679
mlfile, metadata['Name'])
Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
# EditorConfig is awesome: https://editorconfig.org
2+
3+
[my_licenses/utf-8*]
4+
charset = utf-8
5+
6+
[my_licenses/utf-16le*]
7+
charset = utf-16le
8+
9+
[my_licenses/utf-16be*]
10+
charset = utf-16be
11+
Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,2 @@
1+
Licenses/* binary
2+
Licenses/*.txt binary diff=txt
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
# PEP 639 - regression 868
2+
3+
see <https://github.com/CycloneDX/cyclonedx-python/issues/868>
4+
5+
PEP-639 expects license files to be UTF-8 encoded text.
6+
Some license files may not be text, and some may not be UTF-8 encoded, but they may still be added as license files.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
this file is
2+
utf-8 encoded
3+
without BOM
4+
😃

0 commit comments

Comments
 (0)