Skip to content

dataset

Classes

HKDataset

HKDataset(path: os.PathLike, cacheable: bool = True)

HKDataset serves as a base class to download and provide unified access to datasets.

Parameters:

  • path (PathLike) –

    Path to dataset

  • cacheable (bool, default: True ) –

    If dataset supports file caching. Defaults to True.

Example:

import numpy as np
import heartkit as hk

class MyDataset(hk.HKDataset):
    """Minimal example dataset built on hk.HKDataset that serves synthetic data."""

    # NOTE(review): besides numpy and heartkit, this snippet uses contextlib,
    # npt (numpy.typing), Generator, PatientData and PatientGenerator; they
    # must also be in scope for the example to run.

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)

    @property
    def name(self) -> str:
        return 'my-dataset'

    @property
    def sampling_rate(self) -> int:
        return 100

    def get_train_patient_ids(self) -> npt.NDArray:
        # Patient IDs 0..79 are used for training.
        return np.arange(80)

    def get_test_patient_ids(self) -> npt.NDArray:
        # Patient IDs 80..99 are reserved for testing.
        return np.arange(80, 100)

    @contextlib.contextmanager
    def patient_data(self, patient_id: int) -> Generator[PatientData, None, None]:
        # Synthetic content: 1000 normally distributed samples and a 10x2
        # array of random integers in [0, 1000). patient_id is not used.
        data = np.random.randn(1000)
        segs = np.random.randint(0, 1000, (10, 2))
        yield {"data": data, "segmentations": segs}

    def signal_generator(
        self,
        patient_generator: PatientGenerator,
        frame_size: int,
        samples_per_patient: int = 1,
        target_rate: int | None = None,
    ) -> Generator[npt.NDArray, None, None]:
        # Yields the full "data" array samples_per_patient times per patient;
        # frame_size and target_rate are not used in this example.
        for patient in patient_generator:
            for _ in range(samples_per_patient):
                with self.patient_data(patient) as pt:
                    yield pt["data"]

    def download(self, num_workers: int | None = None, force: bool = False):
        # Nothing to download: the example data is generated on the fly.
        pass

# Register dataset
hk.DatasetFactory.register("my-dataset", MyDataset)
Source code in heartkit/datasets/dataset.py
def __init__(self, path: os.PathLike, cacheable: bool = True) -> None:
    """HKDataset serves as a base class to download and provide unified access to datasets.

    Args:
        path (os.PathLike): Path to dataset
        cacheable (bool, optional): If dataset supports file caching. Defaults to True.

    Example:

    ```python
    import contextlib
    from typing import Generator

    import numpy as np
    import numpy.typing as npt
    import heartkit as hk

    # NOTE: PatientData and PatientGenerator must also be in scope
    # (their import path is not shown here).

    class MyDataset(hk.HKDataset):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        @property
        def name(self) -> str:
            return 'my-dataset'

        @property
        def sampling_rate(self) -> int:
            return 100

        def get_train_patient_ids(self) -> npt.NDArray:
            return np.arange(80)

        def get_test_patient_ids(self) -> npt.NDArray:
            return np.arange(80, 100)

        @contextlib.contextmanager
        def patient_data(self, patient_id: int) -> Generator[PatientData, None, None]:
            data = np.random.randn(1000)
            segs = np.random.randint(0, 1000, (10, 2))
            yield {"data": data, "segmentations": segs}

        def signal_generator(
            self,
            patient_generator: PatientGenerator,
            frame_size: int,
            samples_per_patient: int = 1,
            target_rate: int | None = None,
        ) -> Generator[npt.NDArray, None, None]:
            for patient in patient_generator:
                for _ in range(samples_per_patient):
                    with self.patient_data(patient) as pt:
                        yield pt["data"]

        def download(self, num_workers: int | None = None, force: bool = False):
            pass

    # Register dataset
    hk.DatasetFactory.register("my-dataset", MyDataset)
    ```
    """
    # Normalize whatever path-like was passed into a Path object.
    self.path = Path(path)
    # Stored privately under a leading-underscore name.
    self._cacheable = cacheable
    # Starts out as an empty dict; nothing is cached at construction time.
    self._cached_data = {}

Attributes

name property
name: str

Dataset name

cacheable property writable
cacheable: bool

If dataset supports in-memory caching.

On smaller datasets, it is recommended to cache the entire dataset in memory.

sampling_rate property
sampling_rate: int

Sampling rate in Hz

mean property
mean: float

Dataset mean

std property
std: float

Dataset standard deviation

Functions

get_train_patient_ids
get_train_patient_ids() -> npt.NDArray

Get dataset's defined training patient IDs

Returns:

  • NDArray

    npt.NDArray: patient IDs

Source code in heartkit/datasets/dataset.py
def get_train_patient_ids(self) -> npt.NDArray:
    """Get dataset's defined training patient IDs

    Returns:
        npt.NDArray: patient IDs

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    raise NotImplementedError()
get_test_patient_ids
get_test_patient_ids() -> npt.NDArray

Get dataset's patient IDs reserved for testing only

Returns:

  • NDArray

    npt.NDArray: patient IDs

Source code in heartkit/datasets/dataset.py
def get_test_patient_ids(self) -> npt.NDArray:
    """Get dataset's patient IDs reserved for testing only

    Returns:
        npt.NDArray: patient IDs

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    raise NotImplementedError()
patient_data
patient_data(patient_id: int) -> Generator[PatientData, None, None]

Get patient data

Parameters:

  • patient_id (int) –

    Patient ID

Returns:

  • Generator

    Generator[PatientData, None, None]: Patient data

Source code in heartkit/datasets/dataset.py
@contextlib.contextmanager
def patient_data(self, patient_id: int) -> Generator[PatientData, None, None]:
    """Get patient data

    Decorated with ``contextlib.contextmanager``, so it is meant to be used
    in a ``with`` statement.

    Args:
        patient_id (int): Patient ID

    Yields:
        PatientData: Patient data

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    raise NotImplementedError()
signal_generator
signal_generator(
    patient_generator: PatientGenerator, frame_size: int, samples_per_patient: int = 1, target_rate: int | None = None
) -> Generator[npt.NDArray, None, None]

Generate random frames.

Parameters:

  • patient_generator (PatientGenerator) –

    Generator that yields patient data.

  • frame_size (int) –

    Frame size

  • samples_per_patient (int, default: 1 ) –

    Samples per patient. Defaults to 1.

  • target_rate (int | None, default: None ) –

    Target rate. Defaults to None.

Returns:

  • Generator

    Generator[npt.NDArray, None, None]: Generator sample of data

Source code in heartkit/datasets/dataset.py
def signal_generator(
    self,
    patient_generator: PatientGenerator,
    frame_size: int,
    samples_per_patient: int = 1,
    target_rate: int | None = None,
) -> Generator[npt.NDArray, None, None]:
    """Generate random frames.

    Args:
        patient_generator (PatientGenerator): Generator that yields patient data.
        frame_size (int): Frame size
        samples_per_patient (int, optional): Samples per patient. Defaults to 1.
        target_rate (int | None, optional): Target rate. Defaults to None.

    Yields:
        npt.NDArray: Generator sample of data

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    raise NotImplementedError()
download
download(num_workers: int | None = None, force: bool = False)

Download dataset

Parameters:

  • num_workers (int | None, default: None ) –
    Number of parallel workers. Defaults to None.
  • force (bool, default: False ) –

    Force redownload. Defaults to False.

Source code in heartkit/datasets/dataset.py
def download(self, num_workers: int | None = None, force: bool = False):
    """Download dataset

    Args:
        num_workers (int | None, optional): Number of parallel workers. Defaults to None.
        force (bool, optional): Force redownload. Defaults to False.

    Raises:
        NotImplementedError: Always raised by this base implementation.
    """
    raise NotImplementedError()
split_train_test_patients
split_train_test_patients(
    patient_ids: npt.NDArray, test_size: float, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> list[list[int]]

Perform train/test split on patients for given task. NOTE: We only perform inter-patient splits and not intra-patient.

Parameters:

  • patient_ids (NDArray) –

    Patient Ids

  • test_size (float) –

    Test size

  • label_map (dict[int, int], default: None ) –

    Label map. Defaults to None.

  • label_type (str, default: None ) –

    Label type. Defaults to None.

Returns:

  • list[list[int]]

    list[list[int]]: Train and test sets of patient ids

Source code in heartkit/datasets/dataset.py
def split_train_test_patients(
    self,
    patient_ids: npt.NDArray,
    test_size: float,
    label_map: dict[int, int] | None = None,
    label_type: str | None = None,
) -> list[list[int]]:
    """Perform train/test split on patients for given task.
    NOTE: We only perform inter-patient splits and not intra-patient.

    This implementation delegates to ``sklearn.model_selection.train_test_split``
    and does not use ``label_map`` or ``label_type``.

    Args:
        patient_ids (npt.NDArray): Patient Ids
        test_size (float): Test size
        label_map (dict[int, int], optional): Label map. Defaults to None.
        label_type (str, optional): Label type. Defaults to None.

    Returns:
        list[list[int]]: Train and test sets of patient ids
    """
    # Only patient_ids and test_size are forwarded; the label arguments are ignored here.
    return sklearn.model_selection.train_test_split(patient_ids, test_size=test_size)
filter_patients_for_labels
filter_patients_for_labels(
    patient_ids: npt.NDArray, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> npt.NDArray

Filter patients for given labels.

Parameters:

  • patient_ids (NDArray) –

    Patient ids

  • label_map (dict[int, int], default: None ) –

    Label map. Defaults to None.

  • label_type (str, default: None ) –

    Label type. Defaults to None.

Returns:

  • NDArray

    npt.NDArray: Filtered patient ids

Source code in heartkit/datasets/dataset.py
def filter_patients_for_labels(
    self, patient_ids: npt.NDArray, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> npt.NDArray:
    """Filter patients for given labels.

    This implementation performs no filtering: it returns ``patient_ids``
    unchanged and does not use ``label_map`` or ``label_type``.

    Args:
        patient_ids (npt.NDArray): Patient ids
        label_map (dict[int, int], optional): Label map. Defaults to None.
        label_type (str, optional): Label type. Defaults to None.

    Returns:
        npt.NDArray: Filtered patient ids
    """
    # Same object is handed back; no copy is made.
    return patient_ids
close
close()

Close dataset

Source code in heartkit/datasets/dataset.py
def close(self):
    """Close dataset"""