文章/答案/技术大牛

发布

社区首页 >问答首页 >极简化Golomb-Rice编码器

问极简化Golomb-Rice编码器
EN

Code Review用户

提问于 2020-02-10 05:55:38

回答 1查看 153关注 0票数 5

我实现了一个非常简单但健壮的白米编码实现。要理解我的动机，请参阅基于此的极简数据压缩器。目前，该实现运行良好。然而，我打算使它更优雅，简洁，并可能更快。欢迎任何关于改进或建设性批评的建议。

先决条件

typedef名称uint32指定宽度正好为32位的无符号整数类型。
uchar名称是unsigned char的别名。
函数size_t minsize(size_t a, size_t b)返回a和b的最小值。
函数size_t ctzu32(uint32 n)返回n中的尾随0位数，从最小有效位位置开始。如果n为0，则结果为32。

码

该实现使用以下数据结构：

enum {
    BIO_MODE_READ,
    BIO_MODE_WRITE
};

struct bio {
    int mode;   /* reading or writing? */
    uchar *ptr; /* pointer to memory */
    uint32 b;   /* bit buffer */
    size_t c;   /* bit counter */
};

此外，我使用了几个辅助函数(它有点长)：

static void bio_reset_after_flush(struct bio *bio)
{
    assert(bio != NULL);

    bio->b = 0;
    bio->c = 0;
}

static void bio_open(struct bio *bio, uchar *ptr, int mode)
{
    assert(bio != NULL);
    assert(ptr != NULL);

    bio->mode = mode;
    bio->ptr = ptr;

    switch (mode) {
        case BIO_MODE_READ:
            bio->c = 32;
            break;
        case BIO_MODE_WRITE:
            bio_reset_after_flush(bio);
            break;
    }
}

static void bio_flush_buffer(struct bio *bio)
{
    assert(bio != NULL);
    assert(bio->ptr != NULL);
    assert(sizeof(uint32) * CHAR_BIT == 32);

    *((uint32 *)bio->ptr) = bio->b;

    bio->ptr += 4;
}

static void bio_reload_buffer(struct bio *bio)
{
    assert(bio != NULL);
    assert(bio->ptr != NULL);

    bio->b = *(uint32 *)bio->ptr;

    bio->ptr += 4;
}

static void bio_put_nonzero_bit(struct bio *bio)
{
    assert(bio != NULL);
    assert(bio->c < 32);

    bio->b |= (uint32)1 << bio->c;

    bio->c++;

    if (bio->c == 32) {
        bio_flush_buffer(bio);
        bio_reset_after_flush(bio);
    }
}

static void bio_write_bits(struct bio *bio, uint32 b, size_t n)
{
    assert(n <= 32);

    while (n > 0) {
        size_t m;

        assert(bio->c < 32);

        m = minsize(32 - bio->c, n);

        assert(32 >= bio->c + m);

        bio->b |= (uint32)((b & (((uint32)1 << m) - 1)) << bio->c);

        bio->c += m;

        if (bio->c == 32) {
            bio_flush_buffer(bio);
            bio_reset_after_flush(bio);
        }

        b >>= m;
        n -= m;
    }
}

static void bio_write_zero_bits(struct bio *bio, size_t n)
{
    assert(n <= 32);

    while (n > 0) {
        size_t m;

        assert(bio->c < 32);

        m = minsize(32 - bio->c, n);

        assert(32 >= bio->c + m);

        bio->c += m;

        if (bio->c == 32) {
            bio_flush_buffer(bio);
            bio_reset_after_flush(bio);
        }

        n -= m;
    }
}

static uint32 bio_read_bits(struct bio *bio, size_t n)
{
    uint32 w;
    size_t s;

    /* reload? */
    if (bio->c == 32) {
        bio_reload_buffer(bio);
        bio->c = 0;
    }

    /* get the avail. least-significant bits */
    s = minsize(32 - bio->c, n);

    w = bio->b & (((uint32)1 << s) - 1);

    bio->b >>= s;
    bio->c += s;

    n -= s;

    /* need more bits? reload & get the most-significant bits */
    if (n > 0) {
        assert(bio->c == 32);

        bio_reload_buffer(bio);
        bio->c = 0;

        w |= (bio->b & (((uint32)1 << n) - 1)) << s;

        bio->b >>= n;
        bio->c += n;
    }

    return w;
}

static void bio_close(struct bio *bio)
{
    assert(bio != NULL);

    if (bio->mode == BIO_MODE_WRITE && bio->c > 0) {
        bio_flush_buffer(bio);
    }
}

static void bio_write_unary(struct bio *bio, uint32 N)
{
    while (N > 32) {
        bio_write_zero_bits(bio, 32);

        N -= 32;
    }

    bio_write_zero_bits(bio, N);

    bio_put_nonzero_bit(bio);
}

static uint32 bio_read_unary(struct bio *bio)
{
    /* get zeros... */
    uint32 total_zeros = 0;

    assert(bio != NULL);

    do {
        size_t s;

        /* reload? */
        if (bio->c == 32) {
            bio_reload_buffer(bio);
            bio->c = 0;
        }

        /* get trailing zeros */
        s = minsize(32 - bio->c, ctzu32(bio->b));

        bio->b >>= s;
        bio->c += s;

        total_zeros += s;
    } while (bio->c == 32);

    /* ...and drop non-zero bit */
    assert(bio->c < 32);

    bio->b >>= 1;
    bio->c++;

    return total_zeros;
}

最后，主要输入函数(使用参数N编码/解码非负整数2^k)定义如下：

void bio_write_gr(struct bio *bio, size_t k, uint32 N)
{
    uint32 Q = N >> k;

    bio_write_unary(bio, Q);

    assert(k <= 32);

    bio_write_bits(bio, N, k);
}

uint32 bio_read_gr(struct bio *bio, size_t k)
{
    uint32 Q;
    uint32 N;

    Q = bio_read_unary(bio);

    N = Q << k;

    assert(k <= 32);

    N |= bio_read_bits(bio, k);

    return N;
}

performance

compression

回答 1

Code Review用户

回答已采纳

发布于 2020-02-12 19:49:02

将模式枚举命名为

虽然enums在C中没有强类型，但假装它们是更优雅的。给出声明类型的enum，并在struct bio中使用它，如下所示：

enum bio_mode {
    BIO_MODE_READ,
    BIO_MODE_WRITE,
};

struct bio {
    enum bio_mode mode;
    ...
};

编译器可以使用这些信息，例如，如果您编写了一条switch (mode) {...}语句，而您忘记处理所有可能的模式，编译器将对此发出警告。

还将将int mode作为参数的函数更改为enum bio_mode mode。

在可能的地方使用标准类型(

)

使用中的标准固定宽度整数类型，而不是自己的名称。所以，不要使用uint32，而是使用uint32_t，而不是使用uchar，而是使用uint8_t。

不需要assert()，uint32_t的大小是32位。

将`struct bio`重新排序为更紧凑的

在大多数64位架构中，struct bio的布局不太理想，因为指针和size_t有64位对齐，而ints有32位对齐。我建议如下：

struct bio {
    enum bio_mode mode;
    uint32_t b;
    uint8_t *ptr;
    size_t c;  
};

Make `ptr` `uint32_t *`

由于您在许多地方将ptr转换为uint32_t *，因此将其直接存储为该类型并在bio_open()中转换一次更有意义。我还建议您在void *中使用bio_open()，因此调用方不需要执行任何强制转换。

struct bio {
    enum bio_mode mode;
    uint32_t b;
    uint32_t *ptr;
    size_t c;  
};

static void bio_open(struct bio *bio, void *ptr, int mode)
{
    ...
    bio->ptr = ptr;
    ...
}

请记住，也要将所有出现的bio->ptr += 4更改为bio->ptr++。

断言`ptr`是32位对齐的

。

只有当指针对齐32位时，才能将指针转换为uint32_t *。在某些体系结构中，不允许通过未正确对齐的指针访问内存。在这样做的情况下，它的效率可能低于使指针正确地对齐。要断言这一点，请写：

assert(((uintptr_t)ptr & 3) == 0);

另一种选择是允许ptr在对bio_open()的调用中对齐，但随后初始化bio->b，使其包含到前32位对齐地址的前几个字节，当然也可以相应地设置bio->c。

断言在`bio_read_()`和`bio_write_()`

中设置了正确的模式

为了避免意外地重用struct bio，或者在同一个bio上混合读写调用，在read函数中使用assert(bio->mode == BIO_MODE_READ)等等。

优化`bio_write_bits()`

在bio_write_bits()中有很多东西可以优化。首先，有很多不必要的铸造进行。虽然它不会改变实际的二进制文件，但它会清理源代码以删除它们，并使查看实际方程变得更容易。例如，您可以只写：

bio->b |= (b & ((1 << m) - 1)) << bio->c;

在上面的代码中，您是在通过b将其转换为bio->c之前掩盖其较低的位元。然而，这是完全没有必要的，因为要么这些高位元一开始就为零，要么它们就会被移除。所以你可以写：

bio->b |= b << bio->c;

更重要的是，您已经将此函数编写为一个循环，但您最多只能进行两次循环:所有n位都适合bio->b，或者您必须刷新一次并将其余的位放入其中。您可以按以下方式重写代码：

static void bio_write_bits(struct bio *bio, uint32_t b, size_t n)
{
    assert(n <= 32);
    assert((b >> n) == 0);
    assert(bio->c < 32);


    bio->b |= b << bio->c;
    bio->c += n;

    /* Exit early if we didn't fill bio->b yet */
    if (bio->c < 32)
        return;

    bio_flush_buffer(bio);

    /* Store the remaining bits */
    bio->c -= 32;
    bio->b = b >> (n - bio->c);
}

对于bio_write_zero_bits()也可以进行类似的优化。

重置`ptr`在`bio_close()`

中的应用

若要在调用struct bio后捕获潜在的使用bio_close()，请在bio_close()中设置bio->ptr = NULL。

验证输入

在bio_read_unary()中，有一个循环读取零位。如果输入格式错误并且只包含零位，怎么办？在使用完整个输入之后，bio_read_unary()将在输入结束后继续读取。

首先，您可以通过假设最多需要进行两次迭代来摆脱循环，就像在bio_write_bits()中一样。其次，最好在struct bio中有一个额外的字段，它要么是缓冲区中剩余的大小，要么是结束指针，并跟踪您的读写量。不要使用assert()来检查您是否通过了结束，而是使用一个实际的if-statement，并返回一个错误，或者至少在输入无效时调用abort()。

票数 3

页面原文内容由Code Review提供。腾讯云小微IT领域专用引擎提供翻译支持

原文链接：

https://codereview.stackexchange.com/questions/236965

复制

相似问题

问极简化Golomb-Rice编码器
EN

先决条件

码

回答 1

Code Review用户

将模式枚举命名为

在可能的地方使用标准类型(

将`struct bio`重新排序为更紧凑的

Make `ptr` `uint32_t *`

断言`ptr`是32位对齐的

断言在`bio_read_()`和`bio_write_()`

优化`bio_write_bits()`

重置`ptr`在`bio_close()`

验证输入

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问极简化Golomb-Rice编码器EN

先决条件

码

回答 1

Code Review用户

将模式枚举命名为

在可能的地方使用标准类型(

将struct bio重新排序为更紧凑的

Make ptr uint32_t *

断言ptr是32位对齐的

断言在bio_read_*()和bio_write_*()

优化bio_write_bits()

重置ptr在bio_close()

验证输入

社区

活动

圈层

关于

腾讯云开发者

热门产品

热门推荐

更多推荐

问极简化Golomb-Rice编码器
EN

将`struct bio`重新排序为更紧凑的

Make `ptr` `uint32_t *`

断言`ptr`是32位对齐的

断言在`bio_read_()`和`bio_write_()`

优化`bio_write_bits()`

重置`ptr`在`bio_close()`