Skip to content

Models

RVQ autoencoder model architecture using heliaEDGE components.

compressionkit.models.rvq_autoencoder.build_rvq_autoencoder(frame_size, *, embedding_dim=16, latent_width=256, in_ch=1, out_ch=1, base_filters=32, multiplier=1.25, num_levels=2, beta=0.25, num_stages=4, encoder_block_norm='batch', encoder_head_norm='none', decoder_block_norm='none', decoder_head_norm='layer')

Build encoder, RVQ bottleneck, decoder, and composite VQAutoencoder.

Returns:

Type Description
tuple[Model, ResidualVectorQuantizer, Model, VQAutoencoder]

(encoder, rvq, decoder, model) where model is a
helia_edge.trainers.VQAutoencoder wrapping the three components.

Source code in compressionkit/models/rvq_autoencoder.py
def build_rvq_autoencoder(
    frame_size: int,
    *,
    embedding_dim: int = 16,
    latent_width: int = 256,
    in_ch: int = 1,
    out_ch: int = 1,
    base_filters: int = 32,
    multiplier: float = 1.25,
    num_levels: int = 2,
    beta: float = 0.25,
    num_stages: int = 4,
    encoder_block_norm: str = "batch",
    encoder_head_norm: str = "none",
    decoder_block_norm: str = "none",
    decoder_head_norm: str = "layer",
) -> tuple[keras.Model, ResidualVectorQuantizer, keras.Model, VQAutoencoder]:
    """Assemble the encoder, RVQ bottleneck, decoder, and composite model.

    Returns:
        ``(encoder, rvq, decoder, model)`` where *model* is a
        ``helia_edge.trainers.VQAutoencoder`` wrapping the three components.
    """
    # Total temporal downsampling produced by num_stages stride-2 stages.
    ds = 2 ** num_stages
    if frame_size % ds:
        raise ValueError(
            f"frame_size ({frame_size}) must be divisible by 2**num_stages ({ds})"
        )

    encoder = build_encoder_2d(
        input_len=frame_size,
        in_ch=in_ch,
        base=base_filters,
        embedding_dim=embedding_dim,
        multiplier=multiplier,
        num_stages=num_stages,
        block_norm=encoder_block_norm,
        head_norm=encoder_head_norm,
    )
    decoder = build_decoder_2d(
        output_len=frame_size,
        out_ch=out_ch,
        base=base_filters,
        embedding_dim=embedding_dim,
        multiplier=multiplier,
        num_stages=num_stages,
        decoder_block_norm=decoder_block_norm,
        head_norm=decoder_head_norm,
    )
    rvq = ResidualVectorQuantizer(
        num_levels=num_levels,
        num_embeddings=latent_width,
        embedding_dim=embedding_dim,
        beta=beta,
    )

    model = VQAutoencoder(
        encoder=encoder,
        vq=rvq,
        decoder=decoder,
        name=f"RVQAE_2D_ds{ds}",
    )

    return encoder, rvq, decoder, model

compressionkit.models.rvq_autoencoder.build_encoder_2d(input_len=2048, in_ch=1, base=32, embedding_dim=16, multiplier=1.25, num_stages=4, block_norm='batch', head_norm='none')

Build a configurable encoder with 2**num_stages downsampling.

Parameters:

Name Type Description Default
input_len int

Number of input time samples.

2048
in_ch int

Number of input channels.

1
base int

Base filter count for the first stage.

32
embedding_dim int

Latent channel dimension after projection.

16
multiplier float

Filter count multiplier per stage.

1.25
num_stages int

Number of stride-2 downsampling stages.

4
block_norm str

Normalization mode for conv blocks ("batch", "layer", "none").

'batch'
head_norm str

Normalization mode for the final projection.

'none'
Source code in compressionkit/models/rvq_autoencoder.py
def build_encoder_2d(
    input_len: int = 2048,
    in_ch: int = 1,
    base: int = 32,
    embedding_dim: int = 16,
    multiplier: float = 1.25,
    num_stages: int = 4,
    block_norm: str = "batch",
    head_norm: str = "none",
) -> keras.Model:
    """Build an encoder that downsamples time by ``2**num_stages``.

    Args:
        input_len: Number of input time samples.
        in_ch: Number of input channels.
        base: Filter count of the first stage.
        embedding_dim: Latent channel dimension after the 1x1 projection.
        multiplier: Per-stage filter growth factor.
        num_stages: Number of stride-2 downsampling stages (>= 1).
        block_norm: Normalization for conv blocks (``"batch"``, ``"layer"``, ``"none"``).
        head_norm: Normalization applied after the final projection.
    """
    ds = 2 ** num_stages
    if num_stages < 1:
        raise ValueError(f"num_stages must be >= 1, got {num_stages}")
    if input_len % ds:
        raise ValueError(
            f"input_len ({input_len}) must be divisible by 2**num_stages ({ds})"
        )

    inp = keras.layers.Input(shape=(1, input_len, in_ch), name="enc_in")
    # The first (up to) two stages use full convolutions; later stages switch
    # to cheaper depthwise blocks.
    n_full_conv = min(2, num_stages)
    x = inp
    width = base
    for stage in range(num_stages):
        block = conv2d_block if stage < n_full_conv else depthwise2d_block
        x = block(x, width, stride_w=2, name=f"enc_s{stage + 1}", block_norm=block_norm)
        width = make_divisible(width * multiplier, 8)
    x = keras.layers.Conv2D(embedding_dim, (1, 1), padding="same", name="to_vq")(x)
    x = _apply_norm_2d(x, head_norm, name="enc_head_norm")
    return keras.Model(inp, x, name=f"Encoder2D_ds{ds}")

compressionkit.models.rvq_autoencoder.build_decoder_2d(output_len=2048, out_ch=1, base=32, embedding_dim=16, multiplier=1.25, num_stages=4, decoder_block_norm='none', head_norm='layer')

Build a configurable decoder that mirrors the encoder stages.

Parameters:

Name Type Description Default
output_len int

Number of output time samples.

2048
out_ch int

Number of output channels.

1
base int

Base filter count (mirroring the encoder).

32
embedding_dim int

Latent channel dimension.

16
multiplier float

Filter count multiplier per stage.

1.25
num_stages int

Number of upsample stages (must match encoder).

4
decoder_block_norm str

Normalization mode for decoder blocks.

'none'
head_norm str

Normalization mode for the output head.

'layer'
Source code in compressionkit/models/rvq_autoencoder.py
def build_decoder_2d(
    output_len: int = 2048,
    out_ch: int = 1,
    base: int = 32,
    embedding_dim: int = 16,
    multiplier: float = 1.25,
    num_stages: int = 4,
    decoder_block_norm: str = "none",
    head_norm: str = "layer",
) -> keras.Model:
    """Build a decoder whose upsample stages mirror the encoder.

    Args:
        output_len: Number of output time samples.
        out_ch: Number of output channels.
        base: Base filter count (must mirror the encoder's).
        embedding_dim: Latent channel dimension of the input.
        multiplier: Per-stage filter factor (mirroring the encoder).
        num_stages: Number of 2x upsample stages (must match encoder).
        decoder_block_norm: Normalization for decoder blocks.
        head_norm: Normalization applied before the output projection.
    """
    ds = 2 ** num_stages
    if num_stages < 1:
        raise ValueError(f"num_stages must be >= 1, got {num_stages}")
    if output_len % ds:
        raise ValueError(
            f"output_len ({output_len}) must be divisible by 2**num_stages ({ds})"
        )

    latent_len = output_len // ds
    inp = keras.layers.Input(shape=(1, latent_len, embedding_dim), name="latent_in")
    # Start at the widest filter count (the encoder's last stage) and shrink
    # on each upsample, never dropping below 8 channels.
    width = make_divisible(base * (multiplier ** max(num_stages - 1, 0)), 8)
    x = inp
    for stage in range(num_stages):
        x = up2d_block(x, width, name=f"dec_s{stage + 1}", block_norm=decoder_block_norm)
        width = max(8, make_divisible(width / multiplier, 8))
    x = _apply_norm_2d(x, head_norm, name="head_norm")
    out = keras.layers.Conv2D(out_ch, (1, 1), padding="same", name="out")(x)
    return keras.Model(inp, out, name=f"Decoder2D_ds{ds}")

compressionkit.models.rvq_autoencoder.compute_compression_stats(frame_size, *, bit_depth, latent_width, num_levels, downsample_factor=16)

Compute compression ratio and related statistics for an RVQ configuration.

Parameters:

Name Type Description Default
frame_size int

Number of input time samples per frame.

required
bit_depth int

Bits per raw input sample.

required
latent_width int

Number of codebook entries (determines bits per index).

required
num_levels int

Number of RVQ codebook levels.

required
downsample_factor int

Total temporal downsampling factor.

16

Returns:

Type Description
dict[str, float]

Dictionary with compression statistics.

Source code in compressionkit/models/rvq_autoencoder.py
def compute_compression_stats(
    frame_size: int,
    *,
    bit_depth: int,
    latent_width: int,
    num_levels: int,
    downsample_factor: int = 16,
) -> dict[str, float]:
    """Compute compression ratio and related statistics for an RVQ configuration.

    Args:
        frame_size: Number of input time samples per frame.
        bit_depth: Bits per raw input sample.
        latent_width: Number of codebook entries (determines bits per index).
        num_levels: Number of RVQ codebook levels.
        downsample_factor: Total temporal downsampling factor.

    Returns:
        Dictionary with compression statistics.

    Raises:
        ValueError: If ``latent_width`` or ``downsample_factor`` is not
            positive (previously these surfaced as a math domain error or
            ZeroDivisionError).
    """
    if latent_width < 1:
        raise ValueError(f"latent_width must be >= 1, got {latent_width}")
    if downsample_factor < 1:
        raise ValueError(f"downsample_factor must be >= 1, got {downsample_factor}")

    # Floor division: any remainder samples are not represented in the latent.
    latent_positions = frame_size // downsample_factor
    # May be fractional when latent_width is not a power of two.
    bits_per_index = math.log2(latent_width)
    compressed_bits = latent_positions * num_levels * bits_per_index
    raw_bits = frame_size * bit_depth
    # A zero-bit payload (e.g. num_levels == 0) yields an infinite ratio.
    ratio = raw_bits / compressed_bits if compressed_bits else float("inf")
    return {
        "frame_size": frame_size,
        "latent_positions": latent_positions,
        "bits_per_index": bits_per_index,
        "compressed_bits_per_window": compressed_bits,
        "raw_bits_per_window": raw_bits,
        "compression_ratio": ratio,
    }