# Source code for arcana.core.data.row

from __future__ import annotations
from pathlib import Path
import typing as ty
import attrs
from collections import defaultdict
from abc import ABCMeta
from arcana.core.exceptions import (
    ArcanaNameError,
    ArcanaWrongFrequencyError,
)
from fileformats.core.exceptions import FileFormatError
from fileformats.core.base import DataType
from fileformats.core.quality import DataQuality
from .space import DataSpace

if ty.TYPE_CHECKING:
    import arcana.core.data.set


@attrs.define(auto_detect=True)
class DataRow:
    """A "row" in a dataset "frame" where file-groups and fields can be placed,
    e.g. a session or subject.

    Items are looked up by column name (``row[column_name]``) and cached in the
    row after the first successful lookup.

    Parameters
    ----------
    ids : Dict[DataSpace, str]
        The ids for the frequency of the row and all "parent" frequencies
        within the tree
    frequency : DataSpace
        The frequency of the row
    dataset : Dataset
        A reference to the root of the data tree
    children : DefaultDict[DataSpace, Dict[str | Tuple[str], str]], optional
        Child rows of this row, keyed first by frequency. Defaults to an empty
        ``defaultdict(dict)``
    """

    ids: ty.Dict[DataSpace, str] = attrs.field()
    frequency: DataSpace = attrs.field()
    dataset: arcana.core.data.set.Dataset = attrs.field(repr=False)
    children: ty.DefaultDict[
        DataSpace, ty.Dict[ty.Union[str, ty.Tuple[str]], str]
    ] = attrs.field(factory=lambda: defaultdict(dict), repr=False)
    # Lazily populated list of unresolved items (None until first needed)
    _unresolved = attrs.field(default=None, repr=False)
    # Cache of items already matched for this row, keyed by column name
    _items = attrs.field(factory=dict, init=False, repr=False)

    def __getitem__(self, column_name):
        """Gets the item for the current row

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset

        Returns
        -------
        DataType
            The item matching the provided name specified by the column name

        Raises
        ------
        ArcanaNameError
            If ``column_name`` is not the name of a column in the dataset
        ArcanaWrongFrequencyError
            If the column's row frequency differs from the frequency of this row
        """
        if column_name in self._items:
            return self._items[column_name]
        try:
            spec = self.dataset[column_name]
        except KeyError as e:
            raise ArcanaNameError(
                column_name,
                f"{column_name} is not the name of a column in "
                f"{self.dataset.id} dataset ('"
                + "', '".join(self.dataset.columns)
                + "')",
            ) from e
        if spec.row_frequency != self.frequency:
            # Must be raised (not returned): `column_items` relies on catching
            # this exception to fall back to child rows, and returning the
            # exception object would hand it to callers as if it were an item
            raise ArcanaWrongFrequencyError(
                column_name,
                f"'column_name' ({column_name}) is of {spec.row_frequency} "
                f"frequency and therefore not in rows of {self.frequency}"
                " frequency",
            )
        item = self._items[column_name] = spec.match(self)
        return item

    def __setitem__(self, column_name, value):
        """Stores ``value`` in the item of the given column for this row

        The item is looked up via ``__getitem__`` (so the same errors can be
        raised) and its ``put`` method is called with ``value``.

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset
        value : Any
            The value passed to the item's ``put`` method
        """
        item = self[column_name]
        item.put(value)
        # Only visible when ``__setitem__`` is called explicitly; the
        # ``row[name] = value`` syntax discards the return value
        return item

    def __repr__(self):
        return f"{type(self).__name__}(id={self.id}, frequency={self.frequency})"

    @property
    def id(self):
        """The ID of the row at its own frequency"""
        return self.ids[self.frequency]

    @property
    def label(self):
        # NOTE(review): no `path` field is declared on this class in the code
        # visible here, so this presumably fails with AttributeError - confirm
        # where `path` is meant to come from
        return self.path[-1]

    def __iter__(self):
        return iter(self.keys())

    def keys(self):
        """Names of the dataset columns that match the frequency of this row"""
        return (n for n, _ in self.items())

    def values(self):
        """Items of the dataset columns that match the frequency of this row"""
        return (i for _, i in self.items())

    def items(self):
        """(name, item) pairs for the dataset columns that match the frequency
        of this row"""
        return (
            (c.name, self[c.name])
            for c in self.dataset.columns.values()
            if c.row_frequency == self.frequency
        )

    def column_items(self, column_name):
        """Gets the item for the current row if item's frequency matches
        otherwise gets all the items that are related to the current row (
        i.e. are in child rows)

        Parameters
        ----------
        column_name : str
            Name of a selected column in the dataset

        Returns
        -------
        Sequence[DataType]
            The item matching the provided name specified by the column name
            if the column is of matching frequency, otherwise the items found
            in the child rows of the column's frequency (or in the whole
            dataset, see note in body).
        """
        try:
            return [self[column_name]]
        except ArcanaWrongFrequencyError:
            # If frequency is not a ancestor row then return the
            # items in the children of the row (if they are child
            # rows) or the whole dataset
            spec = self.dataset.columns[column_name]
            try:
                return self.children[spec.row_frequency].values()
            except KeyError:
                # NOTE(review): with the default `defaultdict(dict)` children
                # a missing frequency yields an empty dict rather than a
                # KeyError, so this fallback is only reached if a plain
                # mapping was supplied for `children` - confirm intent
                return self.dataset.column(spec.row_frequency)

    @property
    def unresolved(self):
        """List of unresolved items in the row

        On first access the list is initialised and
        ``self.dataset.store.find_items(self)`` is called (presumably to
        populate it via ``add_file_group``/``add_field`` - confirm against the
        store implementation).
        """
        if self._unresolved is None:
            self._unresolved = []
            self.dataset.store.find_items(self)
        return self._unresolved

    def resolved(self, datatype):
        """
        Items in the row that are able to be resolved to the given datatype

        Parameters
        ----------
        datatype : type
            The file datatype or type to resolve the item to

        Returns
        -------
        list
            The results of ``datatype.resolve`` for every unresolved item that
            did not raise a ``FileFormatError``
        """
        matches = []
        for potential in self.unresolved:
            try:
                matches.append(datatype.resolve(potential))
            except FileFormatError:
                # Item can't be interpreted as this datatype, skip it
                pass
        return matches

    @property
    def ids_tuple(self):
        """The row's ids as returned by the dataset's ``ids_tuple`` method"""
        return self.dataset.ids_tuple(self.ids)

    def add_file_group(self, path, **kwargs):
        """Appends an ``UnresolvedFileGroup`` at ``path`` to the row's list of
        unresolved items, forwarding ``kwargs`` to its constructor"""
        if self._unresolved is None:
            self._unresolved = []
        self._unresolved.append(UnresolvedFileGroup(path=path, row=self, **kwargs))

    def add_field(self, path, value, **kwargs):
        """Appends an ``UnresolvedField`` with the given ``path`` and ``value``
        to the row's list of unresolved items, forwarding ``kwargs`` to its
        constructor"""
        if self._unresolved is None:
            self._unresolved = []
        self._unresolved.append(
            UnresolvedField(path=path, row=self, value=value, **kwargs)
        )
@attrs.define
class UnresolvedDataType(metaclass=ABCMeta):
    """Base class for a data item found in a row whose datatype has not yet been
    determined. Candidate datatypes attempt to resolve the item via their
    'resolve' method (see ``DataRow.resolved``)

    Parameters
    ----------
    path : str
        The name_path to the relative location of the file group, i.e. excluding
        information about which row in the data tree it belongs to
    row : DataRow
        The data row that the item belongs to
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    """

    path: str = attrs.field(default=None)
    row: DataRow = attrs.field(default=None)
    order: int = attrs.field(default=None)
    quality: DataQuality = attrs.field(default=DataQuality.usable)
    provenance: ty.Dict[str, ty.Any] = attrs.field(default=None)
    _matched: ty.Dict[str, DataType] = attrs.field(factory=dict, init=False)

    @property
    def item_kwargs(self):
        """The path, order, row and quality of the item as a dict of keyword
        arguments"""
        return {
            "path": self.path,
            "order": self.order,
            "row": self.row,
            "quality": self.quality,
        }


def normalise_paths(file_paths):
    """Convert all file paths to absolute ``Path`` objects

    Uses ``Path.absolute``, so symlinks are not resolved. Falsey input (``None``
    or an empty sequence) is returned unchanged.
    """
    if file_paths:
        file_paths = [Path(p).absolute() for p in file_paths]
    return file_paths


@attrs.define
class UnresolvedFileGroup(UnresolvedDataType):
    """A file-group stored in, potentially multiple, unknown file formats.
    File formats are resolved by providing a list of candidates to the
    'resolve' method

    Parameters
    ----------
    path : str
        The name_path to the relative location of the file group, i.e. excluding
        information about which row in the data tree it belongs to
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    row : DataRow
        The data row that the field belongs to
    file_paths : Sequence[str] | None
        Path to the file-group in the local cache
    uris : Dict[str, str] | None
        For stores where the name of the file datatype is saved with the
        data (i.e. XNAT), the name of the resource enables straightforward
        datatype identification. It is stored here along with URIs
        corresponding to each resource
    """

    file_paths: ty.Sequence[Path] = attrs.field(factory=list, converter=normalise_paths)
    uris: ty.Dict[str, str] = attrs.field(default=None)

    @classmethod
    def from_paths(cls, base_dir: Path, paths: ty.List[Path], **kwargs):
        """Groups file paths into unresolved file-groups by their candidate stems

        Each path is registered under its path relative to ``base_dir`` with all
        extensions stripped, and additionally under every progressively longer
        stem (e.g. ``a/b.tar.gz`` is added to the groups ``a/b``, ``a/b.tar`` and
        ``a/b.tar.gz``), so a single file can appear in several groups.

        Parameters
        ----------
        base_dir : Path
            Directory that all of ``paths`` are located under
        paths : List[Path]
            The file paths to group
        **kwargs
            Forwarded to the constructor of every created file-group

        Returns
        -------
        List[UnresolvedFileGroup]
            One file-group per distinct stem

        Raises
        ------
        ValueError
            If a path is not located under ``base_dir`` (raised by
            ``Path.relative_to``)
        """
        groups = defaultdict(list)
        for path in paths:
            relpath = path.relative_to(base_dir)
            suffixes = relpath.suffixes
            all_exts = "".join(suffixes)
            rel_str = str(relpath)
            # Guard the no-extension case: slicing with `[:-0]` would yield an
            # empty string and lump every extension-less file under ""
            path_stem = rel_str[: -len(all_exts)] if all_exts else rel_str
            groups[path_stem].append(path)  # No extension case
            # Add all possible stems
            partial_stem = path_stem
            for ext in suffixes:
                partial_stem += ext
                groups[partial_stem].append(path)
        return [cls(path=p, file_paths=g, **kwargs) for p, g in groups.items()]


@attrs.define
class UnresolvedField(UnresolvedDataType):
    """A field value stored in a row whose datatype has not yet been resolved

    Parameters
    ----------
    path : str
        The name_path to the relative location of the field, i.e. excluding
        information about which row in the data tree it belongs to
    value : str
        The value assigned to the unresolved data item (for fields instead of
        file groups)
    order : int | None
        The ID of the file_group in the session. To be used to
        distinguish multiple file_groups with the same scan type in the
        same session, e.g. scans taken before and after a task. For
        datasets where this isn't stored (i.e. Local), id can be None
    quality : DataQuality
        The quality label assigned to the file_group (e.g. as is saved on XNAT)
    provenance : Provenance | None
        The provenance for the pipeline that generated the file-group,
        if applicable
    row : DataRow
        The data row that the field belongs to
    """

    value: ty.Union[
        float, int, str, ty.List[float], ty.List[int], ty.List[str]
    ] = attrs.field(default=None)

    # def _resolve(self, datatype):
    #     try:
    #         if datatype._name == 'Sequence':
    #             if len(datatype.__args__) > 1:
    #                 raise ArcanaUsageError(
    #                     f"Sequence formats with more than one arg "
    #                     "are not supported ({datatype})")
    #             subtype = datatype.__args__[0]
    #             value = [subtype(v)
    #                      for v in self.value[1:-1].split(',')]
    #         else:
    #             value = datatype(self.value)
    #     except ValueError as e:
    #         raise ArcanaUnresolvableFormatException(
    #             f"Could not convert value of {self} ({self.value}) "
    #             f"to datatype {datatype}") from e
    #     else:
    #         item = DataType(value=value, **self.item_kwargs)
    #     return item