drbd_vli.h (10891B)
1/* SPDX-License-Identifier: GPL-2.0-or-later */ 2/* 3-*- linux-c -*- 4 drbd_receiver.c 5 This file is part of DRBD by Philipp Reisner and Lars Ellenberg. 6 7 Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. 8 Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. 9 Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. 10 11 */ 12 13#ifndef _DRBD_VLI_H 14#define _DRBD_VLI_H 15 16/* 17 * At a granularity of 4KiB storage represented per bit, 18 * and stroage sizes of several TiB, 19 * and possibly small-bandwidth replication, 20 * the bitmap transfer time can take much too long, 21 * if transmitted in plain text. 22 * 23 * We try to reduce the transferred bitmap information 24 * by encoding runlengths of bit polarity. 25 * 26 * We never actually need to encode a "zero" (runlengths are positive). 27 * But then we have to store the value of the first bit. 28 * The first bit of information thus shall encode if the first runlength 29 * gives the number of set or unset bits. 30 * 31 * We assume that large areas are either completely set or unset, 32 * which gives good compression with any runlength method, 33 * even when encoding the runlength as fixed size 32bit/64bit integers. 34 * 35 * Still, there may be areas where the polarity flips every few bits, 36 * and encoding the runlength sequence of those areas with fix size 37 * integers would be much worse than plaintext. 38 * 39 * We want to encode small runlength values with minimum code length, 40 * while still being able to encode a Huge run of all zeros. 41 * 42 * Thus we need a Variable Length Integer encoding, VLI. 43 * 44 * For some cases, we produce more code bits than plaintext input. 45 * We need to send incompressible chunks as plaintext, skip over them 46 * and then see if the next chunk compresses better. 47 * 48 * We don't care too much about "excellent" compression ratio for large 49 * runlengths (all set/all clear): whether we achieve a factor of 100 50 * or 1000 is not that much of an issue. 51 * We do not want to waste too much on short runlengths in the "noisy" 52 * parts of the bitmap, though. 53 * 54 * There are endless variants of VLI, we experimented with: 55 * * simple byte-based 56 * * various bit based with different code word length. 57 * 58 * To avoid yet an other configuration parameter (choice of bitmap compression 59 * algorithm) which was difficult to explain and tune, we just chose the one 60 * variant that turned out best in all test cases. 61 * Based on real world usage patterns, with device sizes ranging from a few GiB 62 * to several TiB, file server/mailserver/webserver/mysql/postgress, 63 * mostly idle to really busy, the all time winner (though sometimes only 64 * marginally better) is: 65 */ 66 67/* 68 * encoding is "visualised" as 69 * __little endian__ bitstream, least significant bit first (left most) 70 * 71 * this particular encoding is chosen so that the prefix code 72 * starts as unary encoding the level, then modified so that 73 * 10 levels can be described in 8bit, with minimal overhead 74 * for the smaller levels. 75 * 76 * Number of data bits follow fibonacci sequence, with the exception of the 77 * last level (+1 data bit, so it makes 64bit total). The only worse code when 78 * encoding bit polarity runlength is 1 plain bits => 2 code bits. 79prefix data bits max val NÂș data bits 800 x 0x2 1 8110 x 0x4 1 82110 xx 0x8 2 831110 xxx 0x10 3 8411110 xxx xx 0x30 5 85111110 xx xxxxxx 0x130 8 8611111100 xxxxxxxx xxxxx 0x2130 13 8711111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 8811111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 8911111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 90 * maximum encodable value: 0x100000400202130 == 2**56 + some */ 91 92/* compression "table": 93 transmitted x 0.29 94 as plaintext x ........................ 95 x ........................ 96 x ........................ 97 x 0.59 0.21........................ 98 x ........................................................ 99 x .. c ................................................... 100 x 0.44.. o ................................................... 101 x .......... d ................................................... 102 x .......... e ................................................... 103 X............. ................................................... 104 x.............. b ................................................... 1052.0x............... i ................................................... 106 #X................ t ................................................... 107 #................. s ........................... plain bits .......... 108-+----------------------------------------------------------------------- 109 1 16 32 64 110*/ 111 112/* LEVEL: (total bits, prefix bits, prefix value), 113 * sorted ascending by number of total bits. 114 * The rest of the code table is calculated at compiletime from this. */ 115 116/* fibonacci data 1, 1, ... */ 117#define VLI_L_1_1() do { \ 118 LEVEL( 2, 1, 0x00); \ 119 LEVEL( 3, 2, 0x01); \ 120 LEVEL( 5, 3, 0x03); \ 121 LEVEL( 7, 4, 0x07); \ 122 LEVEL(10, 5, 0x0f); \ 123 LEVEL(14, 6, 0x1f); \ 124 LEVEL(21, 8, 0x3f); \ 125 LEVEL(29, 8, 0x7f); \ 126 LEVEL(42, 8, 0xbf); \ 127 LEVEL(64, 8, 0xff); \ 128 } while (0) 129 130/* finds a suitable level to decode the least significant part of in. 131 * returns number of bits consumed. 132 * 133 * BUG() for bad input, as that would mean a buggy code table. */ 134static inline int vli_decode_bits(u64 *out, const u64 in) 135{ 136 u64 adj = 1; 137 138#define LEVEL(t,b,v) \ 139 do { \ 140 if ((in & ((1 << b) -1)) == v) { \ 141 *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ 142 return t; \ 143 } \ 144 adj += 1ULL << (t - b); \ 145 } while (0) 146 147 VLI_L_1_1(); 148 149 /* NOT REACHED, if VLI_LEVELS code table is defined properly */ 150 BUG(); 151#undef LEVEL 152} 153 154/* return number of code bits needed, 155 * or negative error number */ 156static inline int __vli_encode_bits(u64 *out, const u64 in) 157{ 158 u64 max = 0; 159 u64 adj = 1; 160 161 if (in == 0) 162 return -EINVAL; 163 164#define LEVEL(t,b,v) do { \ 165 max += 1ULL << (t - b); \ 166 if (in <= max) { \ 167 if (out) \ 168 *out = ((in - adj) << b) | v; \ 169 return t; \ 170 } \ 171 adj = max + 1; \ 172 } while (0) 173 174 VLI_L_1_1(); 175 176 return -EOVERFLOW; 177#undef LEVEL 178} 179 180#undef VLI_L_1_1 181 182/* code from here down is independend of actually used bit code */ 183 184/* 185 * Code length is determined by some unique (e.g. unary) prefix. 186 * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, 187 * not a byte stream. 188 */ 189 190/* for the bitstream, we need a cursor */ 191struct bitstream_cursor { 192 /* the current byte */ 193 u8 *b; 194 /* the current bit within *b, nomalized: 0..7 */ 195 unsigned int bit; 196}; 197 198/* initialize cursor to point to first bit of stream */ 199static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) 200{ 201 cur->b = s; 202 cur->bit = 0; 203} 204 205/* advance cursor by that many bits; maximum expected input value: 64, 206 * but depending on VLI implementation, it may be more. */ 207static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) 208{ 209 bits += cur->bit; 210 cur->b = cur->b + (bits >> 3); 211 cur->bit = bits & 7; 212} 213 214/* the bitstream itself knows its length */ 215struct bitstream { 216 struct bitstream_cursor cur; 217 unsigned char *buf; 218 size_t buf_len; /* in bytes */ 219 220 /* for input stream: 221 * number of trailing 0 bits for padding 222 * total number of valid bits in stream: buf_len * 8 - pad_bits */ 223 unsigned int pad_bits; 224}; 225 226static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) 227{ 228 bs->buf = s; 229 bs->buf_len = len; 230 bs->pad_bits = pad_bits; 231 bitstream_cursor_reset(&bs->cur, bs->buf); 232} 233 234static inline void bitstream_rewind(struct bitstream *bs) 235{ 236 bitstream_cursor_reset(&bs->cur, bs->buf); 237 memset(bs->buf, 0, bs->buf_len); 238} 239 240/* Put (at most 64) least significant bits of val into bitstream, and advance cursor. 241 * Ignores "pad_bits". 242 * Returns zero if bits == 0 (nothing to do). 243 * Returns number of bits used if successful. 244 * 245 * If there is not enough room left in bitstream, 246 * leaves bitstream unchanged and returns -ENOBUFS. 247 */ 248static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) 249{ 250 unsigned char *b = bs->cur.b; 251 unsigned int tmp; 252 253 if (bits == 0) 254 return 0; 255 256 if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) 257 return -ENOBUFS; 258 259 /* paranoia: strip off hi bits; they should not be set anyways. */ 260 if (bits < 64) 261 val &= ~0ULL >> (64 - bits); 262 263 *b++ |= (val & 0xff) << bs->cur.bit; 264 265 for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) 266 *b++ |= (val >> tmp) & 0xff; 267 268 bitstream_cursor_advance(&bs->cur, bits); 269 return bits; 270} 271 272/* Fetch (at most 64) bits from bitstream into *out, and advance cursor. 273 * 274 * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. 275 * 276 * If there are less than the requested number of valid bits left in the 277 * bitstream, still fetches all available bits. 278 * 279 * Returns number of actually fetched bits. 280 */ 281static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) 282{ 283 u64 val; 284 unsigned int n; 285 286 if (bits > 64) 287 return -EINVAL; 288 289 if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) 290 bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) 291 - bs->cur.bit - bs->pad_bits; 292 293 if (bits == 0) { 294 *out = 0; 295 return 0; 296 } 297 298 /* get the high bits */ 299 val = 0; 300 n = (bs->cur.bit + bits + 7) >> 3; 301 /* n may be at most 9, if cur.bit + bits > 64 */ 302 /* which means this copies at most 8 byte */ 303 if (n) { 304 memcpy(&val, bs->cur.b+1, n - 1); 305 val = le64_to_cpu(val) << (8 - bs->cur.bit); 306 } 307 308 /* we still need the low bits */ 309 val |= bs->cur.b[0] >> bs->cur.bit; 310 311 /* and mask out bits we don't want */ 312 val &= ~0ULL >> (64 - bits); 313 314 bitstream_cursor_advance(&bs->cur, bits); 315 *out = val; 316 317 return bits; 318} 319 320/* encodes @in as vli into @bs; 321 322 * return values 323 * > 0: number of bits successfully stored in bitstream 324 * -ENOBUFS @bs is full 325 * -EINVAL input zero (invalid) 326 * -EOVERFLOW input too large for this vli code (invalid) 327 */ 328static inline int vli_encode_bits(struct bitstream *bs, u64 in) 329{ 330 u64 code = code; 331 int bits = __vli_encode_bits(&code, in); 332 333 if (bits <= 0) 334 return bits; 335 336 return bitstream_put_bits(bs, code, bits); 337} 338 339#endif