dataset

Classes

HKDataset

HKDataset(path: os.PathLike, cacheable: bool = True)

HKDataset serves as a base class to download and provide unified access to datasets.

Parameters:

path (PathLike) –

Path to dataset
cacheable (bool, default: True ) –

If dataset supports file caching. Defaults to True.

Example:

import contextlib
from typing import Generator

import numpy as np
import numpy.typing as npt

import heartkit as hk


class MyDataset(hk.HKDataset):
    """Minimal synthetic dataset showing which members a HKDataset subclass overrides.

    NOTE: ``PatientData`` and ``PatientGenerator`` are written as string
    annotations below because this example does not import them; they are the
    type names used in heartkit/datasets/dataset.py.
    """

    def __init__(self, *args, **kwargs):
        # Forward path/cacheable unchanged to the HKDataset constructor.
        super().__init__(*args, **kwargs)

    @property
    def name(self) -> str:
        """Dataset name."""
        return 'my-dataset'

    @property
    def sampling_rate(self) -> int:
        """Sampling rate in Hz."""
        return 100

    def get_train_patient_ids(self) -> npt.NDArray:
        """Patient IDs 0..79 are used for training."""
        return np.arange(80)

    def get_test_patient_ids(self) -> npt.NDArray:
        """Patient IDs 80..99 are reserved for testing."""
        return np.arange(80, 100)

    @contextlib.contextmanager
    def patient_data(self, patient_id: int) -> Generator["PatientData", None, None]:
        """Yield randomly generated data for a patient (patient_id is ignored)."""
        data = np.random.randn(1000)
        segs = np.random.randint(0, 1000, (10, 2))
        yield {"data": data, "segmentations": segs}

    def signal_generator(
        self,
        patient_generator: "PatientGenerator",
        frame_size: int,
        samples_per_patient: int = 1,
        target_rate: int | None = None,
    ) -> Generator[npt.NDArray, None, None]:
        """Yield each patient's signal ``samples_per_patient`` times.

        For brevity the whole signal is yielded; ``frame_size`` and
        ``target_rate`` are not used in this example.
        """
        for patient in patient_generator:
            for _ in range(samples_per_patient):
                with self.patient_data(patient) as pt:
                    yield pt["data"]

    def download(self, num_workers: int | None = None, force: bool = False):
        """Nothing to download for synthetic data."""
        pass

# Register dataset
hk.DatasetFactory.register("my-dataset", MyDataset)

Source code in heartkit/datasets/dataset.py

def __init__(self, path: os.PathLike, cacheable: bool = True) -> None:
    """HKDataset serves as a base class to download and provide unified access to datasets.

    Args:
        path (os.PathLike): Path to dataset
        cacheable (bool, optional): If dataset supports file caching. Defaults to True.

    Example:

    ```python
    import contextlib
    from typing import Generator

    import numpy as np
    import numpy.typing as npt

    import heartkit as hk

    # NOTE: PatientData / PatientGenerator are written as string annotations
    # because this example does not import them.
    class MyDataset(hk.HKDataset):
        def __init__(self, *args, **kwargs):
            super().__init__(*args, **kwargs)

        @property
        def name(self) -> str:
            return 'my-dataset'

        @property
        def sampling_rate(self) -> int:
            return 100

        def get_train_patient_ids(self) -> npt.NDArray:
            return np.arange(80)

        def get_test_patient_ids(self) -> npt.NDArray:
            return np.arange(80, 100)

        @contextlib.contextmanager
        def patient_data(self, patient_id: int) -> Generator["PatientData", None, None]:
            data = np.random.randn(1000)
            segs = np.random.randint(0, 1000, (10, 2))
            yield {"data": data, "segmentations": segs}

        def signal_generator(
            self,
            patient_generator: "PatientGenerator",
            frame_size: int,
            samples_per_patient: int = 1,
            target_rate: int | None = None,
        ) -> Generator[npt.NDArray, None, None]:
            for patient in patient_generator:
                for _ in range(samples_per_patient):
                    with self.patient_data(patient) as pt:
                        yield pt["data"]

        def download(self, num_workers: int | None = None, force: bool = False):
            pass

    # Register dataset
    hk.DatasetFactory.register("my-dataset", MyDataset)
    ```
    """
    # Normalize whatever path-like was given to a pathlib.Path.
    self.path = Path(path)
    # Stored privately as given.
    self._cacheable = cacheable
    # Starts empty; nothing in this constructor populates it.
    self._cached_data = {}

Attributes

name `property`

name: str

Dataset name

cacheable `property` `writable`

cacheable: bool

If dataset supports in-memory caching.

On smaller datasets, it is recommended to cache the entire dataset in memory.

sampling_rate `property`

sampling_rate: int

Sampling rate in Hz

mean `property`

mean: float

Dataset mean

std `property`

std: float

Dataset standard deviation

Functions

get_train_patient_ids

get_train_patient_ids() -> npt.NDArray

Get dataset's defined training patient IDs

Returns:

NDArray –

npt.NDArray: patient IDs

Source code in heartkit/datasets/dataset.py

def get_train_patient_ids(self) -> npt.NDArray:
    """Return the patient IDs that make up the dataset's training split.

    This base implementation is a stub: it always raises, so a concrete
    dataset has to supply its own version.

    Returns:
        npt.NDArray: patient IDs

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

get_test_patient_ids

get_test_patient_ids() -> npt.NDArray

Get dataset's patient IDs reserved for testing only

Returns:

NDArray –

npt.NDArray: patient IDs

Source code in heartkit/datasets/dataset.py

def get_test_patient_ids(self) -> npt.NDArray:
    """Return the patient IDs that are held out for testing only.

    This base implementation is a stub: it always raises, so a concrete
    dataset has to supply its own version.

    Returns:
        npt.NDArray: patient IDs

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError

patient_data

patient_data(patient_id: int) -> Generator[PatientData, None, None]

Get patient data

Parameters:

patient_id (int) –

Patient ID

Returns:

Generator[PatientData, None, None] –

Generator[PatientData, None, None]: Patient data

Source code in heartkit/datasets/dataset.py

@contextlib.contextmanager
def patient_data(self, patient_id: int) -> Generator[PatientData, None, None]:
    """Context manager giving access to one patient's data.

    The method is wrapped by ``contextlib.contextmanager``, so it is meant to
    be used in a ``with`` statement. This base implementation contains no
    ``yield``; it only raises.

    Args:
        patient_id (int): Patient ID

    Yields:
        PatientData: Patient data

    Raises:
        NotImplementedError: Always, in this base implementation.
    """
    raise NotImplementedError()

signal_generator

signal_generator(
    patient_generator: PatientGenerator, frame_size: int, samples_per_patient: int = 1, target_rate: int | None = None
) -> Generator[npt.NDArray, None, None]

Generate random frames.

Parameters:

patient_generator (PatientGenerator) –

Generator that yields patient data.
frame_size (int) –

Frame size
samples_per_patient (int, default: 1 ) –

Samples per patient. Defaults to 1.
target_rate (int | None, default: None ) –

Target rate. Defaults to None.

Returns:

Generator[npt.NDArray, None, None] –

Generator[npt.NDArray, None, None]: Generator sample of data

Source code in heartkit/datasets/dataset.py

def signal_generator(
    self,
    patient_generator: PatientGenerator,
    frame_size: int,
    samples_per_patient: int = 1,
    target_rate: int | None = None,
) -> Generator[npt.NDArray, None, None]:
    """Generate random frames.

    Args:
        patient_generator (PatientGenerator): Generator that yields patient data.
        frame_size (int): Frame size
        samples_per_patient (int, optional): Samples per patient. Defaults to 1.
        target_rate (int | None, optional): Target rate. Defaults to None.

    Returns:
        Generator[npt.NDArray, None, None]: Generator sample of data
    """
    raise NotImplementedError()

download

download(num_workers: int | None = None, force: bool = False)

Download dataset

Parameters:

num_workers (int | None, default: None ) –

Number of parallel workers. Defaults to None.
force (bool, default: False ) –

Force redownload. Defaults to False.

Source code in heartkit/datasets/dataset.py

def download(self, num_workers: int | None = None, force: bool = False):
    """Download dataset

    Args:
        num_workers (int | None, optional): # parallel workers. Defaults to None.
        force (bool, optional): Force redownload. Defaults to False.
    """
    raise NotImplementedError()

split_train_test_patients

split_train_test_patients(
    patient_ids: npt.NDArray, test_size: float, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> list[list[int]]

Perform train/test split on patients for given task. NOTE: We only perform inter-patient splits and not intra-patient.

Parameters:

patient_ids (NDArray) –

Patient Ids
test_size (float) –

Test size
label_map (dict[int, int], default: None ) –

Label map. Defaults to None.
label_type (str, default: None ) –

Label type. Defaults to None.

Returns:

list[list[int]] –

list[list[int]]: Train and test sets of patient ids

Source code in heartkit/datasets/dataset.py

def split_train_test_patients(
    self,
    patient_ids: npt.NDArray,
    test_size: float,
    label_map: dict[int, int] | None = None,
    label_type: str | None = None,
) -> list[list[int]]:
    """Split patient IDs into a training set and a test set.

    The split is made over whole patients (inter-patient), never within a
    single patient's data. The work is delegated to
    ``sklearn.model_selection.train_test_split`` with only ``test_size``
    passed on; ``label_map`` and ``label_type`` are accepted but not used
    by this implementation.

    Args:
        patient_ids (npt.NDArray): Patient Ids
        test_size (float): Test size
        label_map (dict[int, int], optional): Label map. Defaults to None.
        label_type (str, optional): Label type. Defaults to None.

    Returns:
        list[list[int]]: Train and test sets of patient ids
    """
    train_and_test = sklearn.model_selection.train_test_split(patient_ids, test_size=test_size)
    return train_and_test

filter_patients_for_labels

filter_patients_for_labels(
    patient_ids: npt.NDArray, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> npt.NDArray

Filter patients for given labels.

Parameters:

patient_ids (NDArray) –

Patient ids
label_map (dict[int, int], default: None ) –

Label map. Defaults to None.
label_type (str, default: None ) –

Label type. Defaults to None.

Returns:

NDArray –

npt.NDArray: Filtered patient ids

Source code in heartkit/datasets/dataset.py

def filter_patients_for_labels(
    self, patient_ids: npt.NDArray, label_map: dict[int, int] | None = None, label_type: str | None = None
) -> npt.NDArray:
    """Filter patients for given labels.

    Args:
        patient_ids (npt.NDArray): Patient ids
        label_map (dict[int, int], optional): Label map. Defaults to None.
        label_type (str, optional): Label type. Defaults to None.

    Returns:
        npt.NDArray: Filtered patient ids
    """
    return patient_ids

close

close()

Close dataset

Source code in heartkit/datasets/dataset.py

def close(self):
    """Close dataset.

    This implementation does nothing and returns None.
    """
    return None

dataset

Classes

HKDataset

Attributes

name property

cacheable property writable

sampling_rate property

mean property

std property

Functions

get_train_patient_ids

get_test_patient_ids

patient_data

signal_generator

download

parallel workers. Defaults to None.

split_train_test_patients

filter_patients_for_labels

close

name `property`

cacheable `property` `writable`

sampling_rate `property`

mean `property`

std `property`