Source code for arcana.core.data.space

import typing as ty
import re
from enum import Enum
from arcana.core.utils.serialize import ClassResolver
from arcana.core.utils.misc import classproperty


class DataSpace(Enum):
    """
    Base class for all "data space" enums. DataSpace enums specify
    the relationships between rows of a dataset.

    For example in imaging studies, scanning sessions are typically organised
    by analysis group (e.g. test & control), membership within the group (i.e.
    matched subjects) and time-points (for longitudinal studies). We can
    visualise the rows arranged in a 3-D grid along the `group`, `member`, and
    `timepoint` dimensions. Note that datasets that only contain one group or
    time-point can still be represented in the same space, and just be of
    depth=1 along those dimensions.

    All dimensions should be included as members of a DataSpace subclass
    enum with orthogonal binary vector values, e.g.

        member = 0b001
        group = 0b010
        timepoint = 0b100

    In this space, an imaging session row is uniquely defined by its member,
    group and timepoint ID. The most commonly present dimension should be given
    the least significant bit (e.g. imaging datasets will not always have
    different groups or time-points but will always have different members
    (equivalent to subjects when there is one group).

    In addition to the data items stored in the data rows for each session,
    some items only vary along a particular dimension of the grid. The
    "row_frequency" of these rows can be specified using the "basis" members
    (i.e. member, group, timepoint) in contrast to the `session` row_frequency,
    which is the combination of all three

        session = 0b111

    Additionally, some data is stored in aggregated rows that span a plane
    of the grid. These frequencies should also be added to the enum (all
    combinations of the basis frequencies must be included) and given intuitive
    names if possible, e.g.

        subject = 0b011 - uniquely identified subject within the dataset.
        batch = 0b110 - separate group+timepoint combinations
        matchedpoint = 0b101 - matched members and time-points aggregated
                               across groups

    Finally, for items that are singular across the whole dataset there should
    also be a dataset-wide member with value=0:

        dataset = 0b000
    """

    def __str__(self):
        return self.name

    @classmethod
    def leaf(cls):
        """Return the member with the largest value in the enum"""
        return max(cls)

    @classmethod
    def axes(cls):
        """Return the span of the leaf member, i.e. the basis members whose
        bits are set in the largest-valued member"""
        return cls.leaf().span()

    def span(self):
        """Returns the basis dimensions in the data tree that the given
        enum-member projects into.

        For example in `Clinical` data trees, the following frequencies can
        be decomposed into the following basis dims:

            dataset -> []
            group -> [group]
            member -> [member]
            timepoint -> [timepoint]
            subject -> [group, member]
            batch -> [timepoint, group]
            matchedpoint -> [timepoint, member]
            session -> [timepoint, group, member]
        """
        # Check which bits are '1', and append them to the list of levels
        cls = type(self)
        return [cls(b) for b in sorted(self.nonzero_bits(), reverse=True)]

    def nonzero_bits(self):
        """Return the set bits of the member's value as a list of integers,
        lowest bit first"""
        v = self.value
        nonzero = []
        while v:
            w = v & (v - 1)  # clears the lowest set bit of v
            nonzero.append(w ^ v)  # isolates the bit that was just cleared
            v = w
        return nonzero

    def __iter__(self):
        "Iterate over bit string"
        # Start from the highest bit of the largest member's value and walk
        # down to the lowest bit
        bit = (max(type(self)).value + 1) >> 1
        while bit > 0:
            yield bool(self.value & bit)
            bit >>= 1

    def is_basis(self):
        """Return True if exactly one bit is set in the member's value"""
        # Previously called the non-existent `_nonzero_bits`, which raised
        # AttributeError on every call
        return len(self.nonzero_bits()) == 1

    def __eq__(self, other):
        try:
            return self.value == other.value
        except AttributeError:
            # `other` has no `value` (e.g. None or a plain int reached via a
            # hash collision in a dict/set lookup): defer to Python's default
            # handling instead of raising
            return NotImplemented

    def __lt__(self, other):
        return self.value < other.value

    def __le__(self, other):
        return self.value <= other.value

    def __xor__(self, other):
        return type(self)(self.value ^ other.value)

    def __and__(self, other):
        return type(self)(self.value & other.value)

    def __or__(self, other):
        return type(self)(self.value | other.value)

    def __invert__(self):
        # Mask with the leaf value so the complement stays within the bits
        # used by the enum; an unmasked `~value` is negative and is never a
        # valid member value
        cls = type(self)
        return cls(~self.value & max(cls).value)

    def __hash__(self):
        return self.value

    def __bool__(self):
        return bool(self.value)

    def bin(self):
        """Return the binary string representation of the member's value"""
        return bin(self.value)

    @classmethod
    def union(cls, freqs: ty.Sequence[Enum]):
        "Returns the union between data row_frequency values"
        union = cls(0)
        for f in freqs:
            union |= f
        return union

    @classmethod
    def default(cls):
        """Return the member with the largest value in the enum"""
        return max(cls)

    def is_parent(self, child, if_match=False):
        """Checks to see whether the current row_frequency is a "parent" of the
        other data row_frequency, i.e. all the base row_frequency of self
        appear in the "child".

        Parameters
        ----------
        child : DataSpace
            The data row_frequency to check parent/child relationship with
        if_match : bool
            Treat matching frequencies as "parents" of each other

        Returns
        -------
        bool
            True if self is parent of child
        """
        return ((self & child) == self) and (child != self or if_match)

    def tostr(self):
        """Return a string of the form <data-space-enum>[<value>], the format
        parsed by `fromstr`"""
        return f"{ClassResolver.tostr(self, strip_prefix=False)}[{str(self)}]"

    @classmethod
    def fromstr(cls, s):
        """Parse a string of the form <data-space-enum>[<value>]

        Parameters
        ----------
        s : str
            the string to parse

        Returns
        -------
        DataSpace or str
            the member named <value> of the resolved enum, or `s` unchanged
            if the class resolver returns a string

        Raises
        ------
        ValueError
            if `s` does not match the expected format
        """
        match = re.match(r"(.*)\[([^\]]+)\]", s)
        if match is None:
            raise ValueError(
                f"'{s}' is not a string of the format <data-space-enum>[<value>]"
            )
        class_loc, val = match.groups()
        space = ClassResolver(cls)(class_loc)
        return space[val] if not isinstance(space, str) else s

    @classproperty
    def SUBPACKAGE(cls):
        """Cannot be a regular class attribute because then DataSpace won't be able to be extended"""
        return "data"


class Clinical(DataSpace):
    """
    Data space for the hierarchy typical of medical-imaging research
    datasets: subjects are divided into groups and scanned at different
    timepoints (in longitudinal studies).
    """

    # Root row of the dataset (no axis bits set): singular within the dataset
    dataset = 0

    # Axes of the data space, one bit each
    #
    # member: subjects relative to their group membership, i.e. matched pairs
    # of test and control subjects should share the same member IDs
    member = 1 << 0
    # group: subject groups (e.g. test & control)
    group = 1 << 1
    # timepoint: timepoints in longitudinal studies
    timepoint = 1 << 2

    # Combinations of the axes
    #
    # session: a single session (i.e. a single timepoint of a subject)
    session = member | group | timepoint
    # subject: uniquely identified subject within the dataset, as opposed to
    # 'member', which specifies a subject in relation to its group (i.e. one
    # subject for each member in each group). For datasets with only one study
    # group, subject and member are equivalent
    subject = member | group
    # batch: data from separate groups at separate timepoints
    batch = group | timepoint
    # matchedpoint: matched members (e.g. test & control) at a timepoint,
    # i.e. the member and timepoint axes without the group axis
    matchedpoint = member | timepoint


class Samples(DataSpace):
    """
    The most basic data space, with only one dimension
    """

    # Root row of the dataset: singular within the dataset
    dataset = 0

    # The single axis of the data space
    sample = 1