Commit | Line | Data |
---|---|---|
b411b363 PR |
1 | /* |
2 | -*- linux-c -*- | |
3 | drbd_receiver.c | |
4 | This file is part of DRBD by Philipp Reisner and Lars Ellenberg. | |
5 | ||
6 | Copyright (C) 2001-2008, LINBIT Information Technologies GmbH. | |
7 | Copyright (C) 1999-2008, Philipp Reisner <philipp.reisner@linbit.com>. | |
8 | Copyright (C) 2002-2008, Lars Ellenberg <lars.ellenberg@linbit.com>. | |
9 | ||
10 | drbd is free software; you can redistribute it and/or modify | |
11 | it under the terms of the GNU General Public License as published by | |
12 | the Free Software Foundation; either version 2, or (at your option) | |
13 | any later version. | |
14 | ||
15 | drbd is distributed in the hope that it will be useful, | |
16 | but WITHOUT ANY WARRANTY; without even the implied warranty of | |
17 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the | |
18 | GNU General Public License for more details. | |
19 | ||
20 | You should have received a copy of the GNU General Public License | |
21 | along with drbd; see the file COPYING. If not, write to | |
22 | the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, USA. | |
23 | */ | |
24 | ||
25 | #ifndef _DRBD_VLI_H | |
26 | #define _DRBD_VLI_H | |
27 | ||
28 | /* | |
29 | * At a granularity of 4KiB storage represented per bit, | |
30 | * and stroage sizes of several TiB, | |
31 | * and possibly small-bandwidth replication, | |
32 | * the bitmap transfer time can take much too long, | |
33 | * if transmitted in plain text. | |
34 | * | |
25985edc | 35 | * We try to reduce the transferred bitmap information |
b411b363 PR |
36 | * by encoding runlengths of bit polarity. |
37 | * | |
38 | * We never actually need to encode a "zero" (runlengths are positive). | |
39 | * But then we have to store the value of the first bit. | |
40 | * The first bit of information thus shall encode if the first runlength | |
41 | * gives the number of set or unset bits. | |
42 | * | |
43 | * We assume that large areas are either completely set or unset, | |
44 | * which gives good compression with any runlength method, | |
45 | * even when encoding the runlength as fixed size 32bit/64bit integers. | |
46 | * | |
47 | * Still, there may be areas where the polarity flips every few bits, | |
48 | * and encoding the runlength sequence of those areas with fix size | |
49 | * integers would be much worse than plaintext. | |
50 | * | |
51 | * We want to encode small runlength values with minimum code length, | |
52 | * while still being able to encode a Huge run of all zeros. | |
53 | * | |
54 | * Thus we need a Variable Length Integer encoding, VLI. | |
55 | * | |
56 | * For some cases, we produce more code bits than plaintext input. | |
57 | * We need to send incompressible chunks as plaintext, skip over them | |
58 | * and then see if the next chunk compresses better. | |
59 | * | |
60 | * We don't care too much about "excellent" compression ratio for large | |
61 | * runlengths (all set/all clear): whether we achieve a factor of 100 | |
62 | * or 1000 is not that much of an issue. | |
63 | * We do not want to waste too much on short runlengths in the "noisy" | |
64 | * parts of the bitmap, though. | |
65 | * | |
66 | * There are endless variants of VLI, we experimented with: | |
67 | * * simple byte-based | |
68 | * * various bit based with different code word length. | |
69 | * | |
70 | * To avoid yet an other configuration parameter (choice of bitmap compression | |
71 | * algorithm) which was difficult to explain and tune, we just chose the one | |
72 | * variant that turned out best in all test cases. | |
73 | * Based on real world usage patterns, with device sizes ranging from a few GiB | |
74 | * to several TiB, file server/mailserver/webserver/mysql/postgress, | |
75 | * mostly idle to really busy, the all time winner (though sometimes only | |
76 | * marginally better) is: | |
77 | */ | |
78 | ||
79 | /* | |
80 | * encoding is "visualised" as | |
81 | * __little endian__ bitstream, least significant bit first (left most) | |
82 | * | |
83 | * this particular encoding is chosen so that the prefix code | |
84 | * starts as unary encoding the level, then modified so that | |
85 | * 10 levels can be described in 8bit, with minimal overhead | |
86 | * for the smaller levels. | |
87 | * | |
88 | * Number of data bits follow fibonacci sequence, with the exception of the | |
89 | * last level (+1 data bit, so it makes 64bit total). The only worse code when | |
90 | * encoding bit polarity runlength is 1 plain bits => 2 code bits. | |
91 | prefix data bits max val NÂș data bits | |
92 | 0 x 0x2 1 | |
93 | 10 x 0x4 1 | |
94 | 110 xx 0x8 2 | |
95 | 1110 xxx 0x10 3 | |
96 | 11110 xxx xx 0x30 5 | |
97 | 111110 xx xxxxxx 0x130 8 | |
98 | 11111100 xxxxxxxx xxxxx 0x2130 13 | |
99 | 11111110 xxxxxxxx xxxxxxxx xxxxx 0x202130 21 | |
100 | 11111101 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xx 0x400202130 34 | |
101 | 11111111 xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx xxxxxxxx 56 | |
102 | * maximum encodable value: 0x100000400202130 == 2**56 + some */ | |
103 | ||
104 | /* compression "table": | |
105 | transmitted x 0.29 | |
106 | as plaintext x ........................ | |
107 | x ........................ | |
108 | x ........................ | |
109 | x 0.59 0.21........................ | |
110 | x ........................................................ | |
111 | x .. c ................................................... | |
112 | x 0.44.. o ................................................... | |
113 | x .......... d ................................................... | |
114 | x .......... e ................................................... | |
115 | X............. ................................................... | |
116 | x.............. b ................................................... | |
117 | 2.0x............... i ................................................... | |
118 | #X................ t ................................................... | |
119 | #................. s ........................... plain bits .......... | |
120 | -+----------------------------------------------------------------------- | |
121 | 1 16 32 64 | |
122 | */ | |
123 | ||
124 | /* LEVEL: (total bits, prefix bits, prefix value), | |
125 | * sorted ascending by number of total bits. | |
126 | * The rest of the code table is calculated at compiletime from this. */ | |
127 | ||
128 | /* fibonacci data 1, 1, ... */ | |
129 | #define VLI_L_1_1() do { \ | |
130 | LEVEL( 2, 1, 0x00); \ | |
131 | LEVEL( 3, 2, 0x01); \ | |
132 | LEVEL( 5, 3, 0x03); \ | |
133 | LEVEL( 7, 4, 0x07); \ | |
134 | LEVEL(10, 5, 0x0f); \ | |
135 | LEVEL(14, 6, 0x1f); \ | |
136 | LEVEL(21, 8, 0x3f); \ | |
137 | LEVEL(29, 8, 0x7f); \ | |
138 | LEVEL(42, 8, 0xbf); \ | |
139 | LEVEL(64, 8, 0xff); \ | |
140 | } while (0) | |
141 | ||
142 | /* finds a suitable level to decode the least significant part of in. | |
143 | * returns number of bits consumed. | |
144 | * | |
145 | * BUG() for bad input, as that would mean a buggy code table. */ | |
146 | static inline int vli_decode_bits(u64 *out, const u64 in) | |
147 | { | |
148 | u64 adj = 1; | |
149 | ||
150 | #define LEVEL(t,b,v) \ | |
151 | do { \ | |
152 | if ((in & ((1 << b) -1)) == v) { \ | |
153 | *out = ((in & ((~0ULL) >> (64-t))) >> b) + adj; \ | |
154 | return t; \ | |
155 | } \ | |
156 | adj += 1ULL << (t - b); \ | |
157 | } while (0) | |
158 | ||
159 | VLI_L_1_1(); | |
160 | ||
161 | /* NOT REACHED, if VLI_LEVELS code table is defined properly */ | |
162 | BUG(); | |
163 | #undef LEVEL | |
164 | } | |
165 | ||
166 | /* return number of code bits needed, | |
167 | * or negative error number */ | |
168 | static inline int __vli_encode_bits(u64 *out, const u64 in) | |
169 | { | |
170 | u64 max = 0; | |
171 | u64 adj = 1; | |
172 | ||
173 | if (in == 0) | |
174 | return -EINVAL; | |
175 | ||
176 | #define LEVEL(t,b,v) do { \ | |
177 | max += 1ULL << (t - b); \ | |
178 | if (in <= max) { \ | |
179 | if (out) \ | |
180 | *out = ((in - adj) << b) | v; \ | |
181 | return t; \ | |
182 | } \ | |
183 | adj = max + 1; \ | |
184 | } while (0) | |
185 | ||
186 | VLI_L_1_1(); | |
187 | ||
188 | return -EOVERFLOW; | |
189 | #undef LEVEL | |
190 | } | |
191 | ||
192 | #undef VLI_L_1_1 | |
193 | ||
194 | /* code from here down is independend of actually used bit code */ | |
195 | ||
196 | /* | |
197 | * Code length is determined by some unique (e.g. unary) prefix. | |
198 | * This encodes arbitrary bit length, not whole bytes: we have a bit-stream, | |
199 | * not a byte stream. | |
200 | */ | |
201 | ||
202 | /* for the bitstream, we need a cursor */ | |
203 | struct bitstream_cursor { | |
204 | /* the current byte */ | |
205 | u8 *b; | |
206 | /* the current bit within *b, nomalized: 0..7 */ | |
207 | unsigned int bit; | |
208 | }; | |
209 | ||
210 | /* initialize cursor to point to first bit of stream */ | |
211 | static inline void bitstream_cursor_reset(struct bitstream_cursor *cur, void *s) | |
212 | { | |
213 | cur->b = s; | |
214 | cur->bit = 0; | |
215 | } | |
216 | ||
217 | /* advance cursor by that many bits; maximum expected input value: 64, | |
218 | * but depending on VLI implementation, it may be more. */ | |
219 | static inline void bitstream_cursor_advance(struct bitstream_cursor *cur, unsigned int bits) | |
220 | { | |
221 | bits += cur->bit; | |
222 | cur->b = cur->b + (bits >> 3); | |
223 | cur->bit = bits & 7; | |
224 | } | |
225 | ||
226 | /* the bitstream itself knows its length */ | |
227 | struct bitstream { | |
228 | struct bitstream_cursor cur; | |
229 | unsigned char *buf; | |
230 | size_t buf_len; /* in bytes */ | |
231 | ||
232 | /* for input stream: | |
233 | * number of trailing 0 bits for padding | |
234 | * total number of valid bits in stream: buf_len * 8 - pad_bits */ | |
235 | unsigned int pad_bits; | |
236 | }; | |
237 | ||
238 | static inline void bitstream_init(struct bitstream *bs, void *s, size_t len, unsigned int pad_bits) | |
239 | { | |
240 | bs->buf = s; | |
241 | bs->buf_len = len; | |
242 | bs->pad_bits = pad_bits; | |
243 | bitstream_cursor_reset(&bs->cur, bs->buf); | |
244 | } | |
245 | ||
246 | static inline void bitstream_rewind(struct bitstream *bs) | |
247 | { | |
248 | bitstream_cursor_reset(&bs->cur, bs->buf); | |
249 | memset(bs->buf, 0, bs->buf_len); | |
250 | } | |
251 | ||
252 | /* Put (at most 64) least significant bits of val into bitstream, and advance cursor. | |
253 | * Ignores "pad_bits". | |
254 | * Returns zero if bits == 0 (nothing to do). | |
255 | * Returns number of bits used if successful. | |
256 | * | |
257 | * If there is not enough room left in bitstream, | |
258 | * leaves bitstream unchanged and returns -ENOBUFS. | |
259 | */ | |
260 | static inline int bitstream_put_bits(struct bitstream *bs, u64 val, const unsigned int bits) | |
261 | { | |
262 | unsigned char *b = bs->cur.b; | |
263 | unsigned int tmp; | |
264 | ||
265 | if (bits == 0) | |
266 | return 0; | |
267 | ||
268 | if ((bs->cur.b + ((bs->cur.bit + bits -1) >> 3)) - bs->buf >= bs->buf_len) | |
269 | return -ENOBUFS; | |
270 | ||
271 | /* paranoia: strip off hi bits; they should not be set anyways. */ | |
272 | if (bits < 64) | |
273 | val &= ~0ULL >> (64 - bits); | |
274 | ||
275 | *b++ |= (val & 0xff) << bs->cur.bit; | |
276 | ||
277 | for (tmp = 8 - bs->cur.bit; tmp < bits; tmp += 8) | |
278 | *b++ |= (val >> tmp) & 0xff; | |
279 | ||
280 | bitstream_cursor_advance(&bs->cur, bits); | |
281 | return bits; | |
282 | } | |
283 | ||
284 | /* Fetch (at most 64) bits from bitstream into *out, and advance cursor. | |
285 | * | |
286 | * If more than 64 bits are requested, returns -EINVAL and leave *out unchanged. | |
287 | * | |
288 | * If there are less than the requested number of valid bits left in the | |
289 | * bitstream, still fetches all available bits. | |
290 | * | |
291 | * Returns number of actually fetched bits. | |
292 | */ | |
293 | static inline int bitstream_get_bits(struct bitstream *bs, u64 *out, int bits) | |
294 | { | |
295 | u64 val; | |
296 | unsigned int n; | |
297 | ||
298 | if (bits > 64) | |
299 | return -EINVAL; | |
300 | ||
301 | if (bs->cur.b + ((bs->cur.bit + bs->pad_bits + bits -1) >> 3) - bs->buf >= bs->buf_len) | |
302 | bits = ((bs->buf_len - (bs->cur.b - bs->buf)) << 3) | |
303 | - bs->cur.bit - bs->pad_bits; | |
304 | ||
305 | if (bits == 0) { | |
306 | *out = 0; | |
307 | return 0; | |
308 | } | |
309 | ||
310 | /* get the high bits */ | |
311 | val = 0; | |
312 | n = (bs->cur.bit + bits + 7) >> 3; | |
313 | /* n may be at most 9, if cur.bit + bits > 64 */ | |
314 | /* which means this copies at most 8 byte */ | |
315 | if (n) { | |
316 | memcpy(&val, bs->cur.b+1, n - 1); | |
317 | val = le64_to_cpu(val) << (8 - bs->cur.bit); | |
318 | } | |
319 | ||
320 | /* we still need the low bits */ | |
321 | val |= bs->cur.b[0] >> bs->cur.bit; | |
322 | ||
323 | /* and mask out bits we don't want */ | |
324 | val &= ~0ULL >> (64 - bits); | |
325 | ||
326 | bitstream_cursor_advance(&bs->cur, bits); | |
327 | *out = val; | |
328 | ||
329 | return bits; | |
330 | } | |
331 | ||
332 | /* encodes @in as vli into @bs; | |
333 | ||
334 | * return values | |
335 | * > 0: number of bits successfully stored in bitstream | |
336 | * -ENOBUFS @bs is full | |
337 | * -EINVAL input zero (invalid) | |
338 | * -EOVERFLOW input too large for this vli code (invalid) | |
339 | */ | |
340 | static inline int vli_encode_bits(struct bitstream *bs, u64 in) | |
341 | { | |
342 | u64 code = code; | |
343 | int bits = __vli_encode_bits(&code, in); | |
344 | ||
345 | if (bits <= 0) | |
346 | return bits; | |
347 | ||
348 | return bitstream_put_bits(bs, code, bits); | |
349 | } | |
350 | ||
351 | #endif |