WIP: Guess encoding if default does not work #114
2 changed files with 64 additions and 10 deletions
|
|
@ -10,6 +10,8 @@ from typing import (
|
||||||
List,
|
List,
|
||||||
)
|
)
|
||||||
|
|
||||||
|
from charset_normalizer import from_path as cs_from_path
|
||||||
|
|
||||||
from .load_utils import (
|
from .load_utils import (
|
||||||
_assign_context,
|
_assign_context,
|
||||||
_compact_obj,
|
_compact_obj,
|
||||||
|
|
@ -28,6 +30,7 @@ def load_tabby(
|
||||||
jsonld: bool = True,
|
jsonld: bool = True,
|
||||||
recursive: bool = True,
|
recursive: bool = True,
|
||||||
cpaths: List | None = None,
|
cpaths: List | None = None,
|
||||||
|
encoding: str | None = None,
|
||||||
) -> Dict | List:
|
) -> Dict | List:
|
||||||
"""Load a tabby (TSV) record as structured (JSON(-LD)) data
|
"""Load a tabby (TSV) record as structured (JSON(-LD)) data
|
||||||
|
|
||||||
|
|
@ -48,11 +51,14 @@ def load_tabby(
|
||||||
|
|
||||||
With the ``jsonld`` flag, a declared or default JSON-LD context is
|
With the ``jsonld`` flag, a declared or default JSON-LD context is
|
||||||
loaded and inserted into the record.
|
loaded and inserted into the record.
|
||||||
|
|
||||||
|
Encoding used when reading tsv files can be specified as ``encoding``.
|
||||||
"""
|
"""
|
||||||
ldr = _TabbyLoader(
|
ldr = _TabbyLoader(
|
||||||
jsonld=jsonld,
|
jsonld=jsonld,
|
||||||
recursive=recursive,
|
recursive=recursive,
|
||||||
cpaths=cpaths,
|
cpaths=cpaths,
|
||||||
|
encoding=encoding,
|
||||||
)
|
)
|
||||||
return ldr(src=src, single=single)
|
return ldr(src=src, single=single)
|
||||||
|
|
||||||
|
|
@ -63,6 +69,7 @@ class _TabbyLoader:
|
||||||
jsonld: bool = True,
|
jsonld: bool = True,
|
||||||
recursive: bool = True,
|
recursive: bool = True,
|
||||||
cpaths: List[Path] | None = None,
|
cpaths: List[Path] | None = None,
|
||||||
|
encoding: str | None = None,
|
||||||
):
|
):
|
||||||
std_convention_path = Path(__file__).parent / 'conventions'
|
std_convention_path = Path(__file__).parent / 'conventions'
|
||||||
if cpaths is None:
|
if cpaths is None:
|
||||||
|
|
@ -70,6 +77,7 @@ class _TabbyLoader:
|
||||||
else:
|
else:
|
||||||
cpaths.append(std_convention_path)
|
cpaths.append(std_convention_path)
|
||||||
self._cpaths = cpaths
|
self._cpaths = cpaths
|
||||||
|
self._encoding = encoding
|
||||||
self._jsonld = jsonld
|
self._jsonld = jsonld
|
||||||
self._recursive = recursive
|
self._recursive = recursive
|
||||||
|
|
||||||
|
|
@ -94,8 +102,24 @@ class _TabbyLoader:
|
||||||
src=src,
|
src=src,
|
||||||
trace=trace,
|
trace=trace,
|
||||||
)
|
)
|
||||||
|
if self._encoding is not None:
|
||||||
|
tsv_obj = self._parse_tsv_single(src, encoding=self._encoding)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
tsv_obj = self._parse_tsv_single(src)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# by default Path.open() uses locale.getencoding()
|
||||||
|
# that didn't work, try guessing
|
||||||
|
encoding = cs_from_path(src).best().encoding
|
||||||
|
tsv_obj = self._parse_tsv_single(src, encoding=encoding)
|
||||||
|
|
||||||
with src.open(newline='') as tsvfile:
|
obj.update(tsv_obj)
|
||||||
|
|
||||||
|
return self._postproc_obj(obj, src=src, trace=trace)
|
||||||
|
|
||||||
|
def _parse_tsv_single(self, src: Path, encoding: str | None = None) -> Dict:
|
||||||
|
obj = {}
|
||||||
|
with src.open(newline='', encoding=encoding) as tsvfile:
|
||||||
reader = csv.reader(tsvfile, delimiter='\t')
|
reader = csv.reader(tsvfile, delimiter='\t')
|
||||||
# row_id is useful for error reporting
|
# row_id is useful for error reporting
|
||||||
for row_id, row in enumerate(reader):
|
for row_id, row in enumerate(reader):
|
||||||
|
|
@ -117,8 +141,7 @@ class _TabbyLoader:
|
||||||
# we support "sequence" values via multi-column values
|
# we support "sequence" values via multi-column values
|
||||||
# supporting two ways just adds unnecessary complexity
|
# supporting two ways just adds unnecessary complexity
|
||||||
obj[key] = val
|
obj[key] = val
|
||||||
|
return obj
|
||||||
return self._postproc_obj(obj, src=src, trace=trace)
|
|
||||||
|
|
||||||
def _load_many(
|
def _load_many(
|
||||||
self,
|
self,
|
||||||
|
|
@ -144,19 +167,49 @@ class _TabbyLoader:
|
||||||
|
|
||||||
# the table field/column names have purposefully _nothing_
|
# the table field/column names have purposefully _nothing_
|
||||||
# to do with any possibly loaded JSON data
|
# to do with any possibly loaded JSON data
|
||||||
fieldnames = None
|
if self._encoding is not None:
|
||||||
|
tsv_array = self._parse_tsv_many(
|
||||||
|
src, obj_tmpl, trace=trace, fieldnames=None, encoding=self._encoding
|
||||||
|
)
|
||||||
|
else:
|
||||||
|
try:
|
||||||
|
tsv_array = self._parse_tsv_many(
|
||||||
|
src, obj_tmpl, trace=trace, fieldnames=None
|
||||||
|
)
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
# by default Path.open() uses locale.getencoding()
|
||||||
|
# that didn't work, try guessing
|
||||||
|
encoding = cs_from_path(src).best().encoding
|
||||||
|
tsv_array = self._parse_tsv_many(
|
||||||
|
src, obj_tmpl, trace=trace, fieldnames=None, encoding=encoding
|
||||||
|
)
|
||||||
|
|
||||||
with src.open(newline='') as tsvfile:
|
array.extend(tsv_array)
|
||||||
|
|
||||||
|
return array
|
||||||
|
|
||||||
|
def _parse_tsv_many(
|
||||||
|
self,
|
||||||
|
src: Path,
|
||||||
|
obj_tmpl: Dict,
|
||||||
|
trace: List,
|
||||||
|
fieldnames: List | None = None,
|
||||||
|
encoding: str | None = None,
|
||||||
|
) -> List[Dict]:
|
||||||
|
array = []
|
||||||
|
with src.open(newline="", encoding=encoding) as tsvfile:
|
||||||
# we cannot use DictReader -- we need to support identically named
|
# we cannot use DictReader -- we need to support identically named
|
||||||
# columns
|
# columns
|
||||||
reader = csv.reader(tsvfile, delimiter='\t')
|
reader = csv.reader(tsvfile, delimiter="\t")
|
||||||
# row_id is useful for error reporting
|
# row_id is useful for error reporting
|
||||||
for row_id, row in enumerate(reader):
|
for row_id, row in enumerate(reader):
|
||||||
# row is a list of field, with only as many items
|
# row is a list of field, with only as many items
|
||||||
# as this particular row has columns
|
# as this particular row has columns
|
||||||
if not len(row) \
|
if (
|
||||||
or row[0].startswith('#') \
|
not len(row)
|
||||||
or all(v is None for v in row):
|
or row[0].startswith("#")
|
||||||
|
or all(v is None for v in row)
|
||||||
|
):
|
||||||
# skip empty rows, rows with no key, or rows with
|
# skip empty rows, rows with no key, or rows with
|
||||||
# a comment key
|
# a comment key
|
||||||
continue
|
continue
|
||||||
|
|
|
||||||
|
|
@ -17,6 +17,7 @@ install_requires =
|
||||||
datalad >= 0.18.0
|
datalad >= 0.18.0
|
||||||
datalad-next @ git+https://github.com/datalad/datalad-next.git@main
|
datalad-next @ git+https://github.com/datalad/datalad-next.git@main
|
||||||
datalad-metalad
|
datalad-metalad
|
||||||
|
charset-normalizer
|
||||||
openpyxl
|
openpyxl
|
||||||
pyld
|
pyld
|
||||||
packages = find_namespace:
|
packages = find_namespace:
|
||||||
|
|
|
||||||
Loading…
Add table
Add a link
Reference in a new issue