Skip to content

Commit

Permalink
Add chardet encoding detection (#193)
Browse files Browse the repository at this point in the history
  • Loading branch information
MarJMue authored Oct 2, 2024
1 parent a6e3bf0 commit e141ce0
Show file tree
Hide file tree
Showing 8 changed files with 56 additions and 17 deletions.
5 changes: 3 additions & 2 deletions docs/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,11 +14,12 @@ ipython
ipywidgets
sphinx-gallery
sphinx-plotly-directive
sphinxcontrib-mermaid
sphinxcontrib-mermaid
matplotlib
h5py
pyyaml
importlib-resources
rapidfuzz
lark>=1.1.5
pint
pint
chardet
19 changes: 10 additions & 9 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@ description = "An ellipsometry analysis tool for reproducible and comprehensible
dynamic = ["version"]
authors = [
{ name = "Marius Müller", email = "[email protected]" },
{ name = "Florian Dobener", email = "[email protected]" }
{ name = "Florian Dobener", email = "[email protected]" },
]
requires-python = ">=3.8"
license = { file = "LICENSE.txt" }
Expand All @@ -19,7 +19,7 @@ classifiers = [
"Programming Language :: Python :: 3.9",
"Programming Language :: Python :: 3.10",
"Programming Language :: Python :: 3.11",
"Programming Language :: Python :: 3.12"
"Programming Language :: Python :: 3.12",
]
dependencies = [
"scipy",
Expand All @@ -32,6 +32,7 @@ dependencies = [
"rapidfuzz",
"lark>=1.1.5",
"pint",
"chardet",
]

[project.optional-dependencies]
Expand Down Expand Up @@ -75,16 +76,16 @@ indent-width = 4

[tool.ruff.lint]
select = [
"E", # pycodestyle
"W", # pycodestyle
"PL", # pylint
"E", # pycodestyle
"W", # pycodestyle
"PL", # pylint
"NPY201", # numpy
]
ignore = [
"E501", # Line too long ({width} > {limit} characters)
"E701", # Multiple statements on one line (colon)
"E731", # Do not assign a lambda expression, use a def
"E402", # Module level import not at top of file
"E501", # Line too long ({width} > {limit} characters)
"E701", # Multiple statements on one line (colon)
"E731", # Do not assign a lambda expression, use a def
"E402", # Module level import not at top of file
"PLR0911", # Too many return statements
"PLR0912", # Too many branches
"PLR0913", # Too many arguments in function definition
Expand Down
7 changes: 6 additions & 1 deletion requirements/dev-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -28,6 +28,12 @@ cfgv==3.4.0 \
--hash=sha256:b7265b1f29fd3316bfcd2b330d63d024f2bfd8bcb8b0272f8e19a504856c48f9 \
--hash=sha256:e52591d4c5f5dead8e0f673fb16db7949d2cfb3f7da4582893288f0ded8fe560
# via pre-commit
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via
# -r requirements/fitting-requirements.txt
# pyelli (pyproject.toml)
comm==0.2.2 \
--hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \
--hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3
Expand Down Expand Up @@ -1278,7 +1284,6 @@ typing-extensions==4.12.2 \
# -r requirements/fitting-requirements.txt
# flexcache
# flexparser
# ipython
# pint
tzdata==2024.1 \
--hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \
Expand Down
7 changes: 6 additions & 1 deletion requirements/fitting-requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,12 @@ asttokens==2.4.1 \
--hash=sha256:051ed49c3dcae8913ea7cd08e46a606dba30b79993209636c4875bc1d637bc24 \
--hash=sha256:b03869718ba9a6eb027e134bfdf69f38a236d681c83c160d510768af11254ba0
# via stack-data
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via
# -r requirements/requirements.txt
# pyelli (pyproject.toml)
comm==0.2.2 \
--hash=sha256:3fd7a84065306e07bea1773df6eb8282de51ba82f77c72f9c85716ab11fe980e \
--hash=sha256:e6fb86cb70ff661ee8c9c14e7d36d6de3b4066f1441be4063df9c5009f0a64d3
Expand Down Expand Up @@ -764,7 +770,6 @@ typing-extensions==4.12.2 \
# -r requirements/requirements.txt
# flexcache
# flexparser
# ipython
# pint
tzdata==2024.1 \
--hash=sha256:2674120f8d891909751c38abcdfd386ac0a5a1127954fbc332af6b5ceae07efd \
Expand Down
4 changes: 4 additions & 0 deletions requirements/requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,10 @@ appdirs==1.4.4 \
--hash=sha256:7d5d0167b2b1ba821647616af46a749d1c653740dd0d2415100fe26e27afdf41 \
--hash=sha256:a841dacd6b99318a741b166adb07e19ee71a274450e68237b4650ca1055ab128
# via pint
chardet==5.2.0 \
--hash=sha256:1b3b6ff479a8c414bc3fa2c0852995695c4a026dcd6d0633b2dd092ca39c1cf7 \
--hash=sha256:e1cf59446890a00105fe7b7912492ea04b6e6f06d4b742b2c788469e34c82970
# via pyelli (pyproject.toml)
flexcache==0.3 \
--hash=sha256:18743bd5a0621bfe2cf8d519e4c3bfdf57a269c15d1ced3fb4b64e0ff4600656 \
--hash=sha256:d43c9fea82336af6e0115e308d9d33a185390b8346a017564611f1466dcd2e32
Expand Down
14 changes: 14 additions & 0 deletions src/elli/importer/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import chardet


def detect_encoding(fname: str) -> str:
    r"""Detects the encoding of file fname.

    The file is opened in binary mode, read completely and its raw bytes
    are handed to ``chardet.detect``.

    Args:
        fname (str): Filename
    Returns:
        str: Encoding identifier string. Falls back to "utf-8" if chardet
            cannot determine an encoding (e.g. for an empty file).
    """
    with open(fname, "rb") as f:
        raw_data = f.read()
    result = chardet.detect(raw_data)
    # chardet reports ``None`` as encoding when it cannot make a guess.
    # Passing ``None`` on to ``open`` would silently select the platform
    # locale encoding, so fall back to UTF-8 explicitly to always honour
    # the ``-> str`` contract.
    return result["encoding"] or "utf-8"
12 changes: 9 additions & 3 deletions src/elli/importer/spectraray.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from packaging.version import Version, parse

from ..utils import calc_rho
from . import detect_encoding


def read_spectraray_psi_delta(
Expand All @@ -25,10 +26,13 @@ def read_spectraray_psi_delta(
pd.DataFrame: DataFrame containing the psi/delta data in
the format to be further processed inside pyElli.
"""
# detect encoding
encoding = detect_encoding(fname)

# read data and drop empty column
psi_delta_df = pd.read_csv(
fname,
encoding=encoding,
index_col=0,
header=None,
sep=sep,
Expand Down Expand Up @@ -82,9 +86,11 @@ def read_spectraray_mmatrix(
pd.DataFrame: DataFrame containing the psi/delta data in
the format to be further processed inside pyElli.
"""
mueller_matrix = pd.read_csv(fname, sep=sep, decimal=decimal, index_col=0).iloc[
:, -17:-1
]
encoding = detect_encoding(fname)

mueller_matrix = pd.read_csv(
fname, encoding=encoding, sep=sep, decimal=decimal, index_col=0
).iloc[:, -17:-1]
mueller_matrix.index.name = "Wavelength"
mueller_matrix.columns = [
"M11",
Expand Down
5 changes: 4 additions & 1 deletion src/elli/importer/woollam.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@

from ..units import ureg
from ..utils import calc_rho
from . import detect_encoding

logger = logging.getLogger(__name__)

Expand Down Expand Up @@ -167,7 +168,9 @@ def read_woollam_psi_delta(fname: str) -> pd.DataFrame:
the format to be further processed inside pyElli.
"""

with open(fname, encoding="utf-8") as fobj:
encoding = detect_encoding(fname)

with open(fname, encoding=encoding) as fobj:
line_number = fobj.tell()
metadata = []
file_format = ""
Expand Down

0 comments on commit e141ce0

Please sign in to comment.