from __future__ import annotations
from pathlib import Path
import typing as ty
import attrs
from collections import defaultdict
from abc import ABCMeta
from arcana.core.exceptions import (
ArcanaNameError,
ArcanaWrongFrequencyError,
)
from fileformats.core.exceptions import FileFormatError
from fileformats.core.base import DataType
from fileformats.core.quality import DataQuality
from .space import DataSpace
if ty.TYPE_CHECKING:
import arcana.core.data.set
@attrs.define(auto_detect=True)
class DataRow:
    """A "row" in a dataset "frame" where file-groups and fields can be placed, e.g.
    a session or subject.

    Parameters
    ----------
    ids : Dict[DataSpace, str]
        The ids for the frequency of the row and all "parent" frequencies
        within the tree
    frequency : DataSpace
        The frequency of the row
    dataset : Dataset
        A reference to the root of the data tree
    children : DefaultDict[DataSpace, Dict[str | Tuple[str], str]]
        Per-frequency mapping of the entries nested beneath this row. Defaults
        to an empty ``defaultdict(dict)``
    unresolved : list | None
        Pre-populated list of unresolved entries. When left as None the list is
        created on first use (see the ``unresolved`` property and the
        ``add_file_group``/``add_field`` methods)
    """

    ids: ty.Dict[DataSpace, str] = attrs.field()
    frequency: DataSpace = attrs.field()
    dataset: arcana.core.data.set.Dataset = attrs.field(repr=False)
    children: ty.DefaultDict[
        DataSpace, ty.Dict[ty.Union[str, ty.Tuple[str]], str]
    ] = attrs.field(factory=lambda: defaultdict(dict), repr=False)
    # None until the unresolved entries have been requested or added
    _unresolved = attrs.field(default=None, repr=False)
    # Cache of column name -> item, filled lazily by __getitem__
    _items = attrs.field(factory=dict, init=False, repr=False)

    def __getitem__(self, column_name):
        """Gets the item for the current row

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset

        Returns
        -------
        DataType
            The item matching the provided name specified by the column name

        Raises
        ------
        ArcanaNameError
            If ``column_name`` is not a column of the dataset
        ArcanaWrongFrequencyError
            If the column's row frequency differs from this row's frequency
        """
        if column_name in self._items:
            return self._items[column_name]
        try:
            spec = self.dataset[column_name]
        except KeyError as e:
            raise ArcanaNameError(
                column_name,
                f"{column_name} is not the name of a column in "
                f"{self.dataset.id} dataset ('"
                + "', '".join(self.dataset.columns)
                + "')",
            ) from e
        if spec.row_frequency != self.frequency:
            # Must be raised (not returned) so that callers such as
            # `column_items` can catch it and fall back to related rows
            raise ArcanaWrongFrequencyError(
                column_name,
                f"'column_name' ({column_name}) is of {spec.row_frequency} "
                f"frequency and therefore not in rows of {self.frequency}"
                " frequency",
            )
        # Cache the matched item so the column spec is only matched once
        item = self._items[column_name] = spec.match(self)
        return item

    def __setitem__(self, column_name, value):
        """Puts ``value`` into the item of the given column in this row

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset
        value : Any
            The value handed to the item's ``put`` method
        """
        item = self[column_name]
        item.put(value)
        return item

    def __repr__(self):
        return f"{type(self).__name__}(id={self.id}, frequency={self.frequency})"

    @property
    def id(self):
        """The id of this row at its own frequency"""
        return self.ids[self.frequency]

    @property
    def label(self):
        """The last element of the row's ``path``"""
        # NOTE(review): `path` is not declared as a field of this class in the
        # code visible here -- confirm where it is expected to be defined
        return self.path[-1]

    def __iter__(self):
        return iter(self.keys())

    def keys(self):
        """Names of the dataset columns that match this row's frequency"""
        return (n for n, _ in self.items())

    def values(self):
        """Items of the dataset columns that match this row's frequency"""
        return (i for _, i in self.items())

    def items(self):
        """(name, item) pairs for the columns that match this row's frequency"""
        return (
            (c.name, self[c.name])
            for c in self.dataset.columns.values()
            if c.row_frequency == self.frequency
        )

    def column_items(self, column_name):
        """Gets the item for the current row if item's frequency matches
        otherwise gets all the items that are related to the current row (
        i.e. are in child rows)

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset

        Returns
        -------
        Sequence[DataType]
            A single-element list holding this row's item when the column is
            of matching frequency, otherwise the values stored in ``children``
            for the column's frequency, or the result of ``dataset.column`` if
            that lookup raises KeyError
        """
        try:
            return [self[column_name]]
        except ArcanaWrongFrequencyError:
            # If frequency is not a ancestor row then return the
            # items in the children of the row (if they are child
            # rows) or the whole dataset
            spec = self.dataset.columns[column_name]
            try:
                # NOTE(review): with the default `defaultdict(dict)` a missing
                # frequency yields an empty view instead of a KeyError, so the
                # fallback below only triggers if `children` was supplied as a
                # plain mapping -- confirm this is intended
                return self.children[spec.row_frequency].values()
            except KeyError:
                return self.dataset.column(spec.row_frequency)

    @property
    def unresolved(self):
        """Unresolved entries of the row, requested from the store on first access"""
        if self._unresolved is None:
            # Initialise before calling the store so there is a list in place
            # while `find_items` runs (presumably it populates the row via
            # `add_file_group`/`add_field` -- verify against the store)
            self._unresolved = []
            self.dataset.store.find_items(self)
        return self._unresolved

    def resolved(self, datatype):
        """
        Items in the row that are able to be resolved to the given datatype

        Parameters
        ----------
        datatype : type
            The file datatype or type to resolve the item to

        Returns
        -------
        list
            The results of ``datatype.resolve`` for every unresolved entry
            that did not raise a FileFormatError
        """
        matches = []
        for potential in self.unresolved:
            try:
                matches.append(datatype.resolve(potential))
            except FileFormatError:
                # Entry cannot be interpreted as `datatype`; skip it
                pass
        return matches

    @property
    def ids_tuple(self):
        """The row ids converted by the dataset's ``ids_tuple`` method"""
        return self.dataset.ids_tuple(self.ids)

    def add_file_group(self, path, **kwargs):
        """Appends an UnresolvedFileGroup for ``path`` to the unresolved entries

        Parameters
        ----------
        path : str
            The path of the file-group, passed on to UnresolvedFileGroup
        **kwargs
            Further keyword arguments passed on to UnresolvedFileGroup
        """
        if self._unresolved is None:
            self._unresolved = []
        self._unresolved.append(UnresolvedFileGroup(path=path, row=self, **kwargs))

    def add_field(self, path, value, **kwargs):
        """Appends an UnresolvedField for ``path`` to the unresolved entries

        Parameters
        ----------
        path : str
            The path of the field, passed on to UnresolvedField
        value : Any
            The value of the field, passed on to UnresolvedField
        **kwargs
            Further keyword arguments passed on to UnresolvedField
        """
        if self._unresolved is None:
            self._unresolved = []
        self._unresolved.append(
            UnresolvedField(path=path, row=self, value=value, **kwargs)
        )
@attrs.define
class UnresolvedDataType(metaclass=ABCMeta):
    """Base class for entries found in a data row whose datatype has not been
    determined yet. Instances are handed to ``datatype.resolve`` by
    ``DataRow.resolved`` in order to be converted into concrete items.

    Parameters
    ----------
    path : str
        The name_path to the relative location of the entry, i.e. excluding
        information about which row in the data tree it belongs to
    row : DataRow
        The data row that the entry belongs to
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    """

    path: str = attrs.field(default=None)
    row: DataRow = attrs.field(default=None)
    order: int = attrs.field(default=None)
    quality: DataQuality = attrs.field(default=DataQuality.usable)
    provenance: ty.Dict[str, ty.Any] = attrs.field(default=None)
    # Not an __init__ argument; every instance starts with its own empty dict
    _matched: ty.Dict[str, DataType] = attrs.field(factory=dict, init=False)

    @property
    def item_kwargs(self):
        """Keyword arguments describing this entry: its path, order, row and
        quality (provenance is not included)"""
        return dict(
            path=self.path,
            order=self.order,
            row=self.row,
            quality=self.quality,
        )
def normalise_paths(file_paths):
    """Convert all file paths to absolute ``Path`` objects

    Parameters
    ----------
    file_paths : Sequence[str | Path] | None
        The paths to convert

    Returns
    -------
    list[Path] | Sequence | None
        A new list of absolute paths (made with ``Path.absolute``, so symlinks
        are not resolved), or the input itself, unchanged, if it is None or empty
    """
    # Nothing to convert for None/empty input: hand it straight back
    if not file_paths:
        return file_paths
    return [Path(entry).absolute() for entry in file_paths]
@attrs.define
class UnresolvedFileGroup(UnresolvedDataType):
    """A file-group stored in, potentially multiple, unknown file formats.

    Parameters
    ----------
    path : str
        The name_path to the relative location of the file group, i.e. excluding
        information about which row in the data tree it belongs to
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    row : DataRow
        The data row that the file-group belongs to
    file_paths : Sequence[str] | None
        Path to the file-group in the local cache. Converted with
        ``normalise_paths`` on assignment
    uris : Dict[str, str] | None
        For stores where the name of the file datatype is saved with the
        data (i.e. XNAT), the name of the resource enables straightforward
        datatype identification. It is stored here along with URIs corresponding
        to each resource
    """

    file_paths: ty.Sequence[Path] = attrs.field(factory=list, converter=normalise_paths)
    uris: ty.Dict[str, str] = attrs.field(default=None)

    @classmethod
    def from_paths(cls, base_dir: Path, paths: ty.List[Path], **kwargs):
        """Groups file paths by every stem they could belong to and creates one
        unresolved file-group per stem

        For each path, the path relative to ``base_dir`` is registered under
        its name with all extensions stripped and then under each progressively
        longer name, e.g. ``sub/scan.nii.gz`` is registered under ``sub/scan``,
        ``sub/scan.nii`` and ``sub/scan.nii.gz``. A path without any extension
        is registered under its full relative path only.

        Parameters
        ----------
        base_dir : Path
            The directory that ``paths`` are made relative to
        paths : List[Path]
            The file paths to group. Each must be located under ``base_dir``
        **kwargs
            Keyword arguments passed on to the constructor of every group

        Returns
        -------
        List[UnresolvedFileGroup]
            One file-group per distinct stem, holding all the paths that were
            registered under that stem
        """
        groups = defaultdict(list)
        for path in paths:
            relpath = path.relative_to(base_dir)
            suffixes = relpath.suffixes
            rel_str = str(relpath)
            ext_len = sum(len(suffix) for suffix in suffixes)
            # Guard the no-extension case: slicing with `[:-0]` would give an
            # empty string and lump every extension-less file under ""
            path_stem = rel_str[:-ext_len] if ext_len else rel_str
            groups[path_stem].append(path)  # No extension case
            # Add all possible stems
            candidate = path_stem
            for suffix in suffixes:
                candidate += suffix
                groups[candidate].append(path)
        return [cls(path=p, file_paths=g, **kwargs) for p, g in groups.items()]
@attrs.define
class UnresolvedField(UnresolvedDataType):
    """A field (i.e. a value rather than a group of files) found in a data row
    whose datatype has not been determined yet.

    Parameters
    ----------
    path : str
        The name_path to the relative location of the field, i.e. excluding
        information about which row in the data tree it belongs to
    value : float | int | str | List[float] | List[int] | List[str]
        The value assigned to the unresolved data item (for fields instead of
        file groups)
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    row : DataRow
        The data row that the field belongs to
    """

    # Scalar or list value of the field; None when no value was supplied
    value: ty.Union[
        float,
        int,
        str,
        ty.List[float],
        ty.List[int],
        ty.List[str],
    ] = attrs.field(default=None)
# def _resolve(self, datatype):
# try:
# if datatype._name == 'Sequence':
# if len(datatype.__args__) > 1:
# raise ArcanaUsageError(
# f"Sequence formats with more than one arg "
# "are not supported ({datatype})")
# subtype = datatype.__args__[0]
# value = [subtype(v)
# for v in self.value[1:-1].split(',')]
# else:
# value = datatype(self.value)
# except ValueError as e:
# raise ArcanaUnresolvableFormatException(
# f"Could not convert value of {self} ({self.value}) "
# f"to datatype {datatype}") from e
# else:
# item = DataType(value=value, **self.item_kwargs)
# return item