Skip to content

Models

RVQ autoencoder model architecture using heliaEDGE components.

compressionkit.models.rvq_autoencoder.build_rvq_autoencoder(frame_size, *, embedding_dim=16, latent_width=256, in_ch=1, out_ch=1, base_filters=32, multiplier=1.25, num_levels=2, beta=0.25, num_stages=4, encoder_block_norm='batch', encoder_head_norm='none', decoder_block_norm='none', decoder_head_norm='layer')

Build encoder, RVQ bottleneck, decoder, and composite VQAutoencoder.

Returns:

Type Description
tuple[Model, ResidualVectorQuantizer, Model, VQAutoencoder]

(encoder, rvq, decoder, model) where model is a
helia_edge.trainers.VQAutoencoder wrapping the three components.

Source code in compressionkit/models/rvq_autoencoder.py
def build_rvq_autoencoder(
    frame_size: int,
    *,
    embedding_dim: int = 16,
    latent_width: int = 256,
    in_ch: int = 1,
    out_ch: int = 1,
    base_filters: int = 32,
    multiplier: float = 1.25,
    num_levels: int = 2,
    beta: float = 0.25,
    num_stages: int = 4,
    encoder_block_norm: str = "batch",
    encoder_head_norm: str = "none",
    decoder_block_norm: str = "none",
    decoder_head_norm: str = "layer",
) -> tuple[keras.Model, ResidualVectorQuantizer, keras.Model, VQAutoencoder]:
    """Assemble the encoder, RVQ bottleneck, decoder, and composite model.

    Returns:
        ``(encoder, rvq, decoder, model)`` where *model* is a
        ``helia_edge.trainers.VQAutoencoder`` wrapping the three components.
    """
    # Total temporal downsampling produced by num_stages stride-2 stages.
    ds = 2 ** num_stages
    if frame_size % ds:
        raise ValueError(
            f"frame_size ({frame_size}) must be divisible by 2**num_stages ({ds})"
        )

    encoder = build_encoder_2d(
        input_len=frame_size,
        in_ch=in_ch,
        base=base_filters,
        embedding_dim=embedding_dim,
        multiplier=multiplier,
        num_stages=num_stages,
        block_norm=encoder_block_norm,
        head_norm=encoder_head_norm,
    )
    decoder = build_decoder_2d(
        output_len=frame_size,
        out_ch=out_ch,
        base=base_filters,
        embedding_dim=embedding_dim,
        multiplier=multiplier,
        num_stages=num_stages,
        decoder_block_norm=decoder_block_norm,
        head_norm=decoder_head_norm,
    )
    rvq = ResidualVectorQuantizer(
        num_levels=num_levels,
        num_embeddings=latent_width,
        embedding_dim=embedding_dim,
        beta=beta,
    )

    model = VQAutoencoder(
        encoder=encoder,
        vq=rvq,
        decoder=decoder,
        name=f"RVQAE_2D_ds{ds}",
    )

    return encoder, rvq, decoder, model

compressionkit.models.rvq_autoencoder.build_encoder_2d(input_len=2048, in_ch=1, base=32, embedding_dim=16, multiplier=1.25, num_stages=4, block_norm='batch', head_norm='none')

Build a configurable encoder with 2**num_stages downsampling.

Parameters:

Name Type Description Default
input_len int

Number of input time samples.

2048
in_ch int

Number of input channels.

1
base int

Base filter count for the first stage.

32
embedding_dim int

Latent channel dimension after projection.

16
multiplier float

Filter count multiplier per stage.

1.25
num_stages int

Number of stride-2 downsampling stages.

4
block_norm str

Normalization mode for conv blocks ("batch", "layer", "none").

'batch'
head_norm str

Normalization mode for the final projection.

'none'
Source code in compressionkit/models/rvq_autoencoder.py
def build_encoder_2d(
    input_len: int = 2048,
    in_ch: int = 1,
    base: int = 32,
    embedding_dim: int = 16,
    multiplier: float = 1.25,
    num_stages: int = 4,
    block_norm: str = "batch",
    head_norm: str = "none",
) -> keras.Model:
    """Build an encoder that downsamples time by ``2**num_stages``.

    Args:
        input_len: Number of input time samples.
        in_ch: Number of input channels.
        base: Filter count of the first stage.
        embedding_dim: Latent channel dimension after the 1x1 projection.
        multiplier: Per-stage filter growth factor.
        num_stages: Number of stride-2 downsampling stages (>= 1).
        block_norm: Normalization for conv blocks (``"batch"``, ``"layer"``, ``"none"``).
        head_norm: Normalization applied after the final projection.
    """
    ds = 2 ** num_stages
    if num_stages < 1:
        raise ValueError(f"num_stages must be >= 1, got {num_stages}")
    if input_len % ds:
        raise ValueError(
            f"input_len ({input_len}) must be divisible by 2**num_stages ({ds})"
        )

    inp = keras.layers.Input(shape=(1, input_len, in_ch), name="enc_in")
    # The first (up to) two stages use full convolutions; later stages switch
    # to cheaper depthwise blocks.
    n_full_conv = min(2, num_stages)
    x = inp
    width = base
    for stage in range(num_stages):
        block = conv2d_block if stage < n_full_conv else depthwise2d_block
        x = block(x, width, stride_w=2, name=f"enc_s{stage + 1}", block_norm=block_norm)
        width = make_divisible(width * multiplier, 8)
    x = keras.layers.Conv2D(embedding_dim, (1, 1), padding="same", name="to_vq")(x)
    x = _apply_norm_2d(x, head_norm, name="enc_head_norm")
    return keras.Model(inp, x, name=f"Encoder2D_ds{ds}")

compressionkit.models.rvq_autoencoder.build_decoder_2d(output_len=2048, out_ch=1, base=32, embedding_dim=16, multiplier=1.25, num_stages=4, decoder_block_norm='none', head_norm='layer')

Build a configurable decoder that mirrors the encoder stages.

Parameters:

Name Type Description Default
output_len int

Number of output time samples.

2048
out_ch int

Number of output channels.

1
base int

Base filter count (mirroring the encoder).

32
embedding_dim int

Latent channel dimension.

16
multiplier float

Filter count multiplier per stage.

1.25
num_stages int

Number of upsample stages (must match encoder).

4
decoder_block_norm str

Normalization mode for decoder blocks.

'none'
head_norm str

Normalization mode for the output head.

'layer'
Source code in compressionkit/models/rvq_autoencoder.py
def build_decoder_2d(
    output_len: int = 2048,
    out_ch: int = 1,
    base: int = 32,
    embedding_dim: int = 16,
    multiplier: float = 1.25,
    num_stages: int = 4,
    decoder_block_norm: str = "none",
    head_norm: str = "layer",
) -> keras.Model:
    """Build a decoder whose upsample stages mirror the encoder.

    Args:
        output_len: Number of output time samples.
        out_ch: Number of output channels.
        base: Base filter count (must mirror the encoder's).
        embedding_dim: Latent channel dimension of the input.
        multiplier: Per-stage filter factor (mirroring the encoder).
        num_stages: Number of 2x upsample stages (must match encoder).
        decoder_block_norm: Normalization for decoder blocks.
        head_norm: Normalization applied before the output projection.
    """
    ds = 2 ** num_stages
    if num_stages < 1:
        raise ValueError(f"num_stages must be >= 1, got {num_stages}")
    if output_len % ds:
        raise ValueError(
            f"output_len ({output_len}) must be divisible by 2**num_stages ({ds})"
        )

    latent_len = output_len // ds
    inp = keras.layers.Input(shape=(1, latent_len, embedding_dim), name="latent_in")
    # Start at the widest filter count (the encoder's last stage) and shrink
    # on each upsample, never dropping below 8 channels.
    width = make_divisible(base * (multiplier ** max(num_stages - 1, 0)), 8)
    x = inp
    for stage in range(num_stages):
        x = up2d_block(x, width, name=f"dec_s{stage + 1}", block_norm=decoder_block_norm)
        width = max(8, make_divisible(width / multiplier, 8))
    x = _apply_norm_2d(x, head_norm, name="head_norm")
    out = keras.layers.Conv2D(out_ch, (1, 1), padding="same", name="out")(x)
    return keras.Model(inp, out, name=f"Decoder2D_ds{ds}")

compressionkit.models.rvq_autoencoder.compute_compression_stats(frame_size, *, bit_depth, latent_width, num_levels, downsample_factor=16)

Compute compression ratio and related statistics for an RVQ configuration.

Parameters:

Name Type Description Default
frame_size int

Number of input time samples per frame.

required
bit_depth int

Bits per raw input sample.

required
latent_width int

Number of codebook entries (determines bits per index).

required
num_levels int

Number of RVQ codebook levels.

required
downsample_factor int

Total temporal downsampling factor.

16

Returns:

Type Description
dict[str, float]

Dictionary with compression statistics.

Source code in compressionkit/models/rvq_autoencoder.py
def compute_compression_stats(
    frame_size: int,
    *,
    bit_depth: int,
    latent_width: int,
    num_levels: int,
    downsample_factor: int = 16,
) -> dict[str, float]:
    """Compute compression ratio and related statistics for an RVQ configuration.

    Args:
        frame_size: Number of input time samples per frame.
        bit_depth: Bits per raw input sample.
        latent_width: Number of codebook entries (determines bits per index).
        num_levels: Number of RVQ codebook levels.
        downsample_factor: Total temporal downsampling factor.

    Returns:
        Dictionary with compression statistics.

    Raises:
        ValueError: If ``latent_width`` or ``downsample_factor`` is not
            positive (previously these surfaced as a math domain error or
            ZeroDivisionError).
    """
    if latent_width < 1:
        raise ValueError(f"latent_width must be >= 1, got {latent_width}")
    if downsample_factor < 1:
        raise ValueError(f"downsample_factor must be >= 1, got {downsample_factor}")

    # Floor division: any remainder samples are not represented in the latent.
    latent_positions = frame_size // downsample_factor
    # May be fractional when latent_width is not a power of two.
    bits_per_index = math.log2(latent_width)
    compressed_bits = latent_positions * num_levels * bits_per_index
    raw_bits = frame_size * bit_depth
    # A zero-bit payload (e.g. num_levels == 0) yields an infinite ratio.
    ratio = raw_bits / compressed_bits if compressed_bits else float("inf")
    return {
        "frame_size": frame_size,
        "latent_positions": latent_positions,
        "bits_per_index": bits_per_index,
        "compressed_bits_per_window": compressed_bits,
        "raw_bits_per_window": raw_bits,
        "compression_ratio": ratio,
    }