WIP: Guess encoding if default does not work #114

Closed
mslw wants to merge 5 commits from encoding into main
2 changed files with 64 additions and 10 deletions

View file

@@ -10,6 +10,8 @@ from typing import (
List, List,
) )
from charset_normalizer import from_path as cs_from_path
from .load_utils import ( from .load_utils import (
_assign_context, _assign_context,
_compact_obj, _compact_obj,
@@ -28,6 +30,7 @@ def load_tabby(
jsonld: bool = True, jsonld: bool = True,
recursive: bool = True, recursive: bool = True,
cpaths: List | None = None, cpaths: List | None = None,
encoding: str | None = None,
) -> Dict | List: ) -> Dict | List:
"""Load a tabby (TSV) record as structured (JSON(-LD)) data """Load a tabby (TSV) record as structured (JSON(-LD)) data
@@ -48,11 +51,14 @@ def load_tabby(
With the ``jsonld`` flag, a declared or default JSON-LD context is With the ``jsonld`` flag, a declared or default JSON-LD context is
loaded and inserted into the record. loaded and inserted into the record.
Encoding used when reading tsv files can be specified as ``encoding``.
""" """
ldr = _TabbyLoader( ldr = _TabbyLoader(
jsonld=jsonld, jsonld=jsonld,
recursive=recursive, recursive=recursive,
cpaths=cpaths, cpaths=cpaths,
encoding=encoding,
) )
return ldr(src=src, single=single) return ldr(src=src, single=single)
@@ -63,6 +69,7 @@ class _TabbyLoader:
jsonld: bool = True, jsonld: bool = True,
recursive: bool = True, recursive: bool = True,
cpaths: List[Path] | None = None, cpaths: List[Path] | None = None,
encoding: str | None = None,
): ):
std_convention_path = Path(__file__).parent / 'conventions' std_convention_path = Path(__file__).parent / 'conventions'
if cpaths is None: if cpaths is None:
@@ -70,6 +77,7 @@ class _TabbyLoader:
else: else:
cpaths.append(std_convention_path) cpaths.append(std_convention_path)
self._cpaths = cpaths self._cpaths = cpaths
self._encoding = encoding
self._jsonld = jsonld self._jsonld = jsonld
self._recursive = recursive self._recursive = recursive
@@ -94,8 +102,24 @@ class _TabbyLoader:
src=src, src=src,
trace=trace, trace=trace,
) )
if self._encoding is not None:
tsv_obj = self._parse_tsv_single(src, encoding=self._encoding)
else:
try:
tsv_obj = self._parse_tsv_single(src)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
tsv_obj = self._parse_tsv_single(src, encoding=encoding)
with src.open(newline='') as tsvfile: obj.update(tsv_obj)
return self._postproc_obj(obj, src=src, trace=trace)
def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
obj = {}
with src.open(newline='', encoding=encoding) as tsvfile:
reader = csv.reader(tsvfile, delimiter='\t') reader = csv.reader(tsvfile, delimiter='\t')
# row_id is useful for error reporting # row_id is useful for error reporting
for row_id, row in enumerate(reader): for row_id, row in enumerate(reader):
@@ -117,8 +141,7 @@ class _TabbyLoader:
# we support "sequence" values via multi-column values # we support "sequence" values via multi-column values
# supporting two ways just adds unnecessary complexity # supporting two ways just adds unnecessary complexity
obj[key] = val obj[key] = val
return obj
return self._postproc_obj(obj, src=src, trace=trace)
def _load_many( def _load_many(
self, self,
@@ -144,19 +167,49 @@ class _TabbyLoader:
# the table field/column names have purposefully _nothing_ # the table field/column names have purposefully _nothing_
# to do with any possibly loaded JSON data # to do with any possibly loaded JSON data
fieldnames = None if self._encoding is not None:
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None, encoding=self._encoding
)
else:
try:
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None
)
except UnicodeDecodeError:
# by default Path.open() uses locale.getencoding()
# that didn't work, try guessing
encoding = cs_from_path(src).best().encoding
tsv_array = self._parse_tsv_many(
src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
)
with src.open(newline='') as tsvfile: array.extend(tsv_array)
return array
def _parse_tsv_many(
self,
src: Path,
obj_tmpl: Dict,
trace: List,
fieldnames: List | None = None,
encoding: str | None = None,
) -> List[Dict]:
array = []
with src.open(newline="", encoding=encoding) as tsvfile:
# we cannot use DictReader -- we need to support identically named # we cannot use DictReader -- we need to support identically named
# columns # columns
reader = csv.reader(tsvfile, delimiter='\t') reader = csv.reader(tsvfile, delimiter="\t")
# row_id is useful for error reporting # row_id is useful for error reporting
for row_id, row in enumerate(reader): for row_id, row in enumerate(reader):
# row is a list of field, with only as many items # row is a list of field, with only as many items
# as this particular row has columns # as this particular row has columns
if not len(row) \ if (
or row[0].startswith('#') \ not len(row)
or all(v is None for v in row): or row[0].startswith("#")
or all(v is None for v in row)
):
# skip empty rows, rows with no key, or rows with # skip empty rows, rows with no key, or rows with
# a comment key # a comment key
continue continue

View file

@@ -17,6 +17,7 @@ install_requires =
datalad >= 0.18.0 datalad >= 0.18.0
datalad-next @ git+https://github.com/datalad/datalad-next.git@main datalad-next @ git+https://github.com/datalad/datalad-next.git@main
datalad-metalad datalad-metalad
charset-normalizer
openpyxl openpyxl
pyld pyld
packages = find_namespace: packages = find_namespace: