media: vicodec: improve handling of uncompressable planes
[linux-block.git] / drivers / media / platform / vicodec / vicodec-codec.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright 2016 Tom aan de Wiel
4  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7  *
8  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9  * R.D. Brown, 1977
10  */
11
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
14
15 #define ALL_ZEROS 15
16
/*
 * Zigzag scan order for an 8x8 block: entry i is the row-major index
 * (x + y * 8) of the i-th coefficient visited. The block is walked one
 * anti-diagonal at a time, starting at the top-left corner.
 */
static const uint8_t zigzag[8 * 8] = {
        0,                              /* diagonal 0 */
        1,  8,                          /* diagonal 1 */
        2,  9, 16,                      /* diagonal 2 */
        3, 10, 17, 24,                  /* diagonal 3 */
        4, 11, 18, 25, 32,              /* diagonal 4 */
        5, 12, 19, 26, 33, 40,          /* diagonal 5 */
        6, 13, 20, 27, 34, 41, 48,      /* diagonal 6 */
        7, 14, 21, 28, 35, 42, 49, 56,  /* diagonal 7 (longest) */
        15, 22, 29, 36, 43, 50, 57,     /* diagonal 8 */
        23, 30, 37, 44, 51, 58,         /* diagonal 9 */
        31, 38, 45, 52, 59,             /* diagonal 10 */
        39, 46, 53, 60,                 /* diagonal 11 */
        47, 54, 61,                     /* diagonal 12 */
        55, 62,                         /* diagonal 13 */
        63,                             /* diagonal 14 */
};
34
35
36 static int rlc(const s16 *in, __be16 *output, int blocktype)
37 {
38         s16 block[8 * 8];
39         s16 *wp = block;
40         int i = 0;
41         int x, y;
42         int ret = 0;
43
44         /* read in block from framebuffer */
45         int lastzero_run = 0;
46         int to_encode;
47
48         for (y = 0; y < 8; y++) {
49                 for (x = 0; x < 8; x++) {
50                         *wp = in[x + y * 8];
51                         wp++;
52                 }
53         }
54
55         /* keep track of amount of trailing zeros */
56         for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
57                 lastzero_run++;
58
59         *output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
60         ret++;
61
62         to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);
63
64         i = 0;
65         while (i < to_encode) {
66                 int cnt = 0;
67                 int tmp;
68
69                 /* count leading zeros */
70                 while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
71                         cnt++;
72                         i++;
73                         if (i == to_encode) {
74                                 cnt--;
75                                 break;
76                         }
77                 }
78                 /* 4 bits for run, 12 for coefficient (quantization by 4) */
79                 *output++ = htons((cnt | tmp << 4));
80                 i++;
81                 ret++;
82         }
83         if (lastzero_run > 14) {
84                 *output = htons(ALL_ZEROS | 0);
85                 ret++;
86         }
87
88         return ret;
89 }
90
91 /*
92  * This function will worst-case increase rlc_in by 65*2 bytes:
93  * one s16 value for the header and 8 * 8 coefficients of type s16.
94  */
95 static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
96 {
97         /* header */
98         const __be16 *input = *rlc_in;
99         s16 ret = ntohs(*input++);
100         int dec_count = 0;
101         s16 block[8 * 8 + 16];
102         s16 *wp = block;
103         int i;
104
105         /*
106          * Now de-compress, it expands one byte to up to 15 bytes
107          * (or fills the remainder of the 64 bytes with zeroes if it
108          * is the last byte to expand).
109          *
110          * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
111          * allow for overflow if the incoming data was malformed.
112          */
113         while (dec_count < 8 * 8) {
114                 s16 in = ntohs(*input++);
115                 int length = in & 0xf;
116                 int coeff = in >> 4;
117
118                 /* fill remainder with zeros */
119                 if (length == 15) {
120                         for (i = 0; i < 64 - dec_count; i++)
121                                 *wp++ = 0;
122                         break;
123                 }
124
125                 for (i = 0; i < length; i++)
126                         *wp++ = 0;
127                 *wp++ = coeff;
128                 dec_count += length + 1;
129         }
130
131         wp = block;
132
133         for (i = 0; i < 64; i++) {
134                 int pos = zigzag[i];
135                 int y = pos / 8;
136                 int x = pos % 8;
137
138                 dwht_out[x + y * 8] = *wp++;
139         }
140         *rlc_in = input;
141         return ret;
142 }
143
/*
 * Per-coefficient shift counts used by quantize_intra() and
 * dequantize_intra(), laid out row-major to match the 8x8 block.
 */
static const int quant_table[8 * 8] = {
        2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 3,
        2, 2, 2, 2, 2, 2, 3, 6,
        2, 2, 2, 2, 2, 3, 6, 6,
        2, 2, 2, 2, 3, 6, 6, 6,
        2, 2, 2, 3, 6, 6, 6, 6,
        2, 2, 3, 6, 6, 6, 6, 8,
};
154
/*
 * Per-coefficient shift counts used by quantize_inter() and
 * dequantize_inter(), laid out row-major to match the 8x8 block.
 */
static const int quant_table_p[8 * 8] = {
        3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 3,
        3, 3, 3, 3, 3, 3, 3, 6,
        3, 3, 3, 3, 3, 3, 6, 6,
        3, 3, 3, 3, 3, 6, 6, 9,
        3, 3, 3, 3, 6, 6, 9, 9,
        3, 3, 3, 6, 6, 9, 9, 10,
};
165
166 static void quantize_intra(s16 *coeff, s16 *de_coeff, u16 qp)
167 {
168         const int *quant = quant_table;
169         int i, j;
170
171         for (j = 0; j < 8; j++) {
172                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
173                         *coeff >>= *quant;
174                         if (*coeff >= -qp && *coeff <= qp)
175                                 *coeff = *de_coeff = 0;
176                         else
177                                 *de_coeff = *coeff << *quant;
178                 }
179         }
180 }
181
182 static void dequantize_intra(s16 *coeff)
183 {
184         const int *quant = quant_table;
185         int i, j;
186
187         for (j = 0; j < 8; j++)
188                 for (i = 0; i < 8; i++, quant++, coeff++)
189                         *coeff <<= *quant;
190 }
191
192 static void quantize_inter(s16 *coeff, s16 *de_coeff, u16 qp)
193 {
194         const int *quant = quant_table_p;
195         int i, j;
196
197         for (j = 0; j < 8; j++) {
198                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
199                         *coeff >>= *quant;
200                         if (*coeff >= -qp && *coeff <= qp)
201                                 *coeff = *de_coeff = 0;
202                         else
203                                 *de_coeff = *coeff << *quant;
204                 }
205         }
206 }
207
208 static void dequantize_inter(s16 *coeff)
209 {
210         const int *quant = quant_table_p;
211         int i, j;
212
213         for (j = 0; j < 8; j++)
214                 for (i = 0; i < 8; i++, quant++, coeff++)
215                         *coeff <<= *quant;
216 }
217
218 static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
219                  unsigned int input_step, bool intra)
220 {
221         /* we'll need more than 8 bits for the transformed coefficients */
222         s32 workspace1[8], workspace2[8];
223         const u8 *tmp = block;
224         s16 *out = output_block;
225         int add = intra ? 256 : 0;
226         unsigned int i;
227
228         /* stage 1 */
229         stride *= input_step;
230
231         for (i = 0; i < 8; i++, tmp += stride, out += 8) {
232                 switch (input_step) {
233                 case 1:
234                         workspace1[0]  = tmp[0] + tmp[1] - add;
235                         workspace1[1]  = tmp[0] - tmp[1];
236
237                         workspace1[2]  = tmp[2] + tmp[3] - add;
238                         workspace1[3]  = tmp[2] - tmp[3];
239
240                         workspace1[4]  = tmp[4] + tmp[5] - add;
241                         workspace1[5]  = tmp[4] - tmp[5];
242
243                         workspace1[6]  = tmp[6] + tmp[7] - add;
244                         workspace1[7]  = tmp[6] - tmp[7];
245                         break;
246                 case 2:
247                         workspace1[0]  = tmp[0] + tmp[2] - add;
248                         workspace1[1]  = tmp[0] - tmp[2];
249
250                         workspace1[2]  = tmp[4] + tmp[6] - add;
251                         workspace1[3]  = tmp[4] - tmp[6];
252
253                         workspace1[4]  = tmp[8] + tmp[10] - add;
254                         workspace1[5]  = tmp[8] - tmp[10];
255
256                         workspace1[6]  = tmp[12] + tmp[14] - add;
257                         workspace1[7]  = tmp[12] - tmp[14];
258                         break;
259                 case 3:
260                         workspace1[0]  = tmp[0] + tmp[3] - add;
261                         workspace1[1]  = tmp[0] - tmp[3];
262
263                         workspace1[2]  = tmp[6] + tmp[9] - add;
264                         workspace1[3]  = tmp[6] - tmp[9];
265
266                         workspace1[4]  = tmp[12] + tmp[15] - add;
267                         workspace1[5]  = tmp[12] - tmp[15];
268
269                         workspace1[6]  = tmp[18] + tmp[21] - add;
270                         workspace1[7]  = tmp[18] - tmp[21];
271                         break;
272                 default:
273                         workspace1[0]  = tmp[0] + tmp[4] - add;
274                         workspace1[1]  = tmp[0] - tmp[4];
275
276                         workspace1[2]  = tmp[8] + tmp[12] - add;
277                         workspace1[3]  = tmp[8] - tmp[12];
278
279                         workspace1[4]  = tmp[16] + tmp[20] - add;
280                         workspace1[5]  = tmp[16] - tmp[20];
281
282                         workspace1[6]  = tmp[24] + tmp[28] - add;
283                         workspace1[7]  = tmp[24] - tmp[28];
284                         break;
285                 }
286
287                 /* stage 2 */
288                 workspace2[0] = workspace1[0] + workspace1[2];
289                 workspace2[1] = workspace1[0] - workspace1[2];
290                 workspace2[2] = workspace1[1] - workspace1[3];
291                 workspace2[3] = workspace1[1] + workspace1[3];
292
293                 workspace2[4] = workspace1[4] + workspace1[6];
294                 workspace2[5] = workspace1[4] - workspace1[6];
295                 workspace2[6] = workspace1[5] - workspace1[7];
296                 workspace2[7] = workspace1[5] + workspace1[7];
297
298                 /* stage 3 */
299                 out[0] = workspace2[0] + workspace2[4];
300                 out[1] = workspace2[0] - workspace2[4];
301                 out[2] = workspace2[1] - workspace2[5];
302                 out[3] = workspace2[1] + workspace2[5];
303                 out[4] = workspace2[2] + workspace2[6];
304                 out[5] = workspace2[2] - workspace2[6];
305                 out[6] = workspace2[3] - workspace2[7];
306                 out[7] = workspace2[3] + workspace2[7];
307         }
308
309         out = output_block;
310
311         for (i = 0; i < 8; i++, out++) {
312                 /* stage 1 */
313                 workspace1[0]  = out[0] + out[1 * 8];
314                 workspace1[1]  = out[0] - out[1 * 8];
315
316                 workspace1[2]  = out[2 * 8] + out[3 * 8];
317                 workspace1[3]  = out[2 * 8] - out[3 * 8];
318
319                 workspace1[4]  = out[4 * 8] + out[5 * 8];
320                 workspace1[5]  = out[4 * 8] - out[5 * 8];
321
322                 workspace1[6]  = out[6 * 8] + out[7 * 8];
323                 workspace1[7]  = out[6 * 8] - out[7 * 8];
324
325                 /* stage 2 */
326                 workspace2[0] = workspace1[0] + workspace1[2];
327                 workspace2[1] = workspace1[0] - workspace1[2];
328                 workspace2[2] = workspace1[1] - workspace1[3];
329                 workspace2[3] = workspace1[1] + workspace1[3];
330
331                 workspace2[4] = workspace1[4] + workspace1[6];
332                 workspace2[5] = workspace1[4] - workspace1[6];
333                 workspace2[6] = workspace1[5] - workspace1[7];
334                 workspace2[7] = workspace1[5] + workspace1[7];
335                 /* stage 3 */
336                 out[0 * 8] = workspace2[0] + workspace2[4];
337                 out[1 * 8] = workspace2[0] - workspace2[4];
338                 out[2 * 8] = workspace2[1] - workspace2[5];
339                 out[3 * 8] = workspace2[1] + workspace2[5];
340                 out[4 * 8] = workspace2[2] + workspace2[6];
341                 out[5 * 8] = workspace2[2] - workspace2[6];
342                 out[6 * 8] = workspace2[3] - workspace2[7];
343                 out[7 * 8] = workspace2[3] + workspace2[7];
344         }
345 }
346
347 /*
348  * Not the nicest way of doing it, but P-blocks get twice the range of
349  * that of the I-blocks. Therefore we need a type bigger than 8 bits.
350  * Furthermore values can be negative... This is just a version that
351  * works with 16 signed data
352  */
353 static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
354 {
355         /* we'll need more than 8 bits for the transformed coefficients */
356         s32 workspace1[8], workspace2[8];
357         const s16 *tmp = block;
358         s16 *out = output_block;
359         int i;
360
361         for (i = 0; i < 8; i++, tmp += stride, out += 8) {
362                 /* stage 1 */
363                 workspace1[0]  = tmp[0] + tmp[1];
364                 workspace1[1]  = tmp[0] - tmp[1];
365
366                 workspace1[2]  = tmp[2] + tmp[3];
367                 workspace1[3]  = tmp[2] - tmp[3];
368
369                 workspace1[4]  = tmp[4] + tmp[5];
370                 workspace1[5]  = tmp[4] - tmp[5];
371
372                 workspace1[6]  = tmp[6] + tmp[7];
373                 workspace1[7]  = tmp[6] - tmp[7];
374
375                 /* stage 2 */
376                 workspace2[0] = workspace1[0] + workspace1[2];
377                 workspace2[1] = workspace1[0] - workspace1[2];
378                 workspace2[2] = workspace1[1] - workspace1[3];
379                 workspace2[3] = workspace1[1] + workspace1[3];
380
381                 workspace2[4] = workspace1[4] + workspace1[6];
382                 workspace2[5] = workspace1[4] - workspace1[6];
383                 workspace2[6] = workspace1[5] - workspace1[7];
384                 workspace2[7] = workspace1[5] + workspace1[7];
385
386                 /* stage 3 */
387                 out[0] = workspace2[0] + workspace2[4];
388                 out[1] = workspace2[0] - workspace2[4];
389                 out[2] = workspace2[1] - workspace2[5];
390                 out[3] = workspace2[1] + workspace2[5];
391                 out[4] = workspace2[2] + workspace2[6];
392                 out[5] = workspace2[2] - workspace2[6];
393                 out[6] = workspace2[3] - workspace2[7];
394                 out[7] = workspace2[3] + workspace2[7];
395         }
396
397         out = output_block;
398
399         for (i = 0; i < 8; i++, out++) {
400                 /* stage 1 */
401                 workspace1[0]  = out[0] + out[1*8];
402                 workspace1[1]  = out[0] - out[1*8];
403
404                 workspace1[2]  = out[2*8] + out[3*8];
405                 workspace1[3]  = out[2*8] - out[3*8];
406
407                 workspace1[4]  = out[4*8] + out[5*8];
408                 workspace1[5]  = out[4*8] - out[5*8];
409
410                 workspace1[6]  = out[6*8] + out[7*8];
411                 workspace1[7]  = out[6*8] - out[7*8];
412
413                 /* stage 2 */
414                 workspace2[0] = workspace1[0] + workspace1[2];
415                 workspace2[1] = workspace1[0] - workspace1[2];
416                 workspace2[2] = workspace1[1] - workspace1[3];
417                 workspace2[3] = workspace1[1] + workspace1[3];
418
419                 workspace2[4] = workspace1[4] + workspace1[6];
420                 workspace2[5] = workspace1[4] - workspace1[6];
421                 workspace2[6] = workspace1[5] - workspace1[7];
422                 workspace2[7] = workspace1[5] + workspace1[7];
423
424                 /* stage 3 */
425                 out[0*8] = workspace2[0] + workspace2[4];
426                 out[1*8] = workspace2[0] - workspace2[4];
427                 out[2*8] = workspace2[1] - workspace2[5];
428                 out[3*8] = workspace2[1] + workspace2[5];
429                 out[4*8] = workspace2[2] + workspace2[6];
430                 out[5*8] = workspace2[2] - workspace2[6];
431                 out[6*8] = workspace2[3] - workspace2[7];
432                 out[7*8] = workspace2[3] + workspace2[7];
433         }
434 }
435
436 static void ifwht(const s16 *block, s16 *output_block, int intra)
437 {
438         /*
439          * we'll need more than 8 bits for the transformed coefficients
440          * use native unit of cpu
441          */
442         int workspace1[8], workspace2[8];
443         int inter = intra ? 0 : 1;
444         const s16 *tmp = block;
445         s16 *out = output_block;
446         int i;
447
448         for (i = 0; i < 8; i++, tmp += 8, out += 8) {
449                 /* stage 1 */
450                 workspace1[0]  = tmp[0] + tmp[1];
451                 workspace1[1]  = tmp[0] - tmp[1];
452
453                 workspace1[2]  = tmp[2] + tmp[3];
454                 workspace1[3]  = tmp[2] - tmp[3];
455
456                 workspace1[4]  = tmp[4] + tmp[5];
457                 workspace1[5]  = tmp[4] - tmp[5];
458
459                 workspace1[6]  = tmp[6] + tmp[7];
460                 workspace1[7]  = tmp[6] - tmp[7];
461
462                 /* stage 2 */
463                 workspace2[0] = workspace1[0] + workspace1[2];
464                 workspace2[1] = workspace1[0] - workspace1[2];
465                 workspace2[2] = workspace1[1] - workspace1[3];
466                 workspace2[3] = workspace1[1] + workspace1[3];
467
468                 workspace2[4] = workspace1[4] + workspace1[6];
469                 workspace2[5] = workspace1[4] - workspace1[6];
470                 workspace2[6] = workspace1[5] - workspace1[7];
471                 workspace2[7] = workspace1[5] + workspace1[7];
472
473                 /* stage 3 */
474                 out[0] = workspace2[0] + workspace2[4];
475                 out[1] = workspace2[0] - workspace2[4];
476                 out[2] = workspace2[1] - workspace2[5];
477                 out[3] = workspace2[1] + workspace2[5];
478                 out[4] = workspace2[2] + workspace2[6];
479                 out[5] = workspace2[2] - workspace2[6];
480                 out[6] = workspace2[3] - workspace2[7];
481                 out[7] = workspace2[3] + workspace2[7];
482         }
483
484         out = output_block;
485
486         for (i = 0; i < 8; i++, out++) {
487                 /* stage 1 */
488                 workspace1[0]  = out[0] + out[1 * 8];
489                 workspace1[1]  = out[0] - out[1 * 8];
490
491                 workspace1[2]  = out[2 * 8] + out[3 * 8];
492                 workspace1[3]  = out[2 * 8] - out[3 * 8];
493
494                 workspace1[4]  = out[4 * 8] + out[5 * 8];
495                 workspace1[5]  = out[4 * 8] - out[5 * 8];
496
497                 workspace1[6]  = out[6 * 8] + out[7 * 8];
498                 workspace1[7]  = out[6 * 8] - out[7 * 8];
499
500                 /* stage 2 */
501                 workspace2[0] = workspace1[0] + workspace1[2];
502                 workspace2[1] = workspace1[0] - workspace1[2];
503                 workspace2[2] = workspace1[1] - workspace1[3];
504                 workspace2[3] = workspace1[1] + workspace1[3];
505
506                 workspace2[4] = workspace1[4] + workspace1[6];
507                 workspace2[5] = workspace1[4] - workspace1[6];
508                 workspace2[6] = workspace1[5] - workspace1[7];
509                 workspace2[7] = workspace1[5] + workspace1[7];
510
511                 /* stage 3 */
512                 if (inter) {
513                         int d;
514
515                         out[0 * 8] = workspace2[0] + workspace2[4];
516                         out[1 * 8] = workspace2[0] - workspace2[4];
517                         out[2 * 8] = workspace2[1] - workspace2[5];
518                         out[3 * 8] = workspace2[1] + workspace2[5];
519                         out[4 * 8] = workspace2[2] + workspace2[6];
520                         out[5 * 8] = workspace2[2] - workspace2[6];
521                         out[6 * 8] = workspace2[3] - workspace2[7];
522                         out[7 * 8] = workspace2[3] + workspace2[7];
523
524                         for (d = 0; d < 8; d++)
525                                 out[8 * d] >>= 6;
526                 } else {
527                         int d;
528
529                         out[0 * 8] = workspace2[0] + workspace2[4];
530                         out[1 * 8] = workspace2[0] - workspace2[4];
531                         out[2 * 8] = workspace2[1] - workspace2[5];
532                         out[3 * 8] = workspace2[1] + workspace2[5];
533                         out[4 * 8] = workspace2[2] + workspace2[6];
534                         out[5 * 8] = workspace2[2] - workspace2[6];
535                         out[6 * 8] = workspace2[3] - workspace2[7];
536                         out[7 * 8] = workspace2[3] + workspace2[7];
537
538                         for (d = 0; d < 8; d++) {
539                                 out[8 * d] >>= 6;
540                                 out[8 * d] += 128;
541                         }
542                 }
543         }
544 }
545
546 static void fill_encoder_block(const u8 *input, s16 *dst,
547                                unsigned int stride, unsigned int input_step)
548 {
549         int i, j;
550
551         for (i = 0; i < 8; i++) {
552                 for (j = 0; j < 8; j++, input += input_step)
553                         *dst++ = *input;
554                 input += (stride - 8) * input_step;
555         }
556 }
557
558 static int var_intra(const s16 *input)
559 {
560         int32_t mean = 0;
561         int32_t ret = 0;
562         const s16 *tmp = input;
563         int i;
564
565         for (i = 0; i < 8 * 8; i++, tmp++)
566                 mean += *tmp;
567         mean /= 64;
568         tmp = input;
569         for (i = 0; i < 8 * 8; i++, tmp++)
570                 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
571         return ret;
572 }
573
574 static int var_inter(const s16 *old, const s16 *new)
575 {
576         int32_t ret = 0;
577         int i;
578
579         for (i = 0; i < 8 * 8; i++, old++, new++)
580                 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
581         return ret;
582 }
583
584 static int decide_blocktype(const u8 *cur, const u8 *reference,
585                             s16 *deltablock, unsigned int stride,
586                             unsigned int input_step)
587 {
588         s16 tmp[64];
589         s16 old[64];
590         s16 *work = tmp;
591         unsigned int k, l;
592         int vari;
593         int vard;
594
595         fill_encoder_block(cur, tmp, stride, input_step);
596         fill_encoder_block(reference, old, 8, 1);
597         vari = var_intra(tmp);
598
599         for (k = 0; k < 8; k++) {
600                 for (l = 0; l < 8; l++) {
601                         *deltablock = *work - *reference;
602                         deltablock++;
603                         work++;
604                         reference++;
605                 }
606         }
607         deltablock -= 64;
608         vard = var_inter(old, tmp);
609         return vari <= vard ? IBLOCK : PBLOCK;
610 }
611
612 static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
613 {
614         int i, j;
615
616         for (i = 0; i < 8; i++) {
617                 for (j = 0; j < 8; j++)
618                         *dst++ = *input++;
619                 dst += stride - 8;
620         }
621 }
622
623 static void add_deltas(s16 *deltas, const u8 *ref, int stride)
624 {
625         int k, l;
626
627         for (k = 0; k < 8; k++) {
628                 for (l = 0; l < 8; l++) {
629                         *deltas += *ref++;
630                         /*
631                          * Due to quantizing, it might possible that the
632                          * decoded coefficients are slightly out of range
633                          */
634                         if (*deltas < 0)
635                                 *deltas = 0;
636                         else if (*deltas > 255)
637                                 *deltas = 255;
638                         deltas++;
639                 }
640                 ref += stride - 8;
641         }
642 }
643
/*
 * Encode one plane of a frame, 8x8 block by 8x8 block.
 *
 * @input:         first sample of the plane
 * @refp:          reference data for the plane, stored as consecutive
 *                 64-byte blocks; rewritten with the reconstructed blocks
 *                 unless @next_is_intra is set
 * @rlco:          in/out write position in the compressed buffer
 * @rlco_max:      once the write position reaches this pointer the plane
 *                 is stored uncompressed instead
 * @cf:            supplies the coefficient work buffers and the
 *                 quantization parameters
 * @height:        plane height in samples
 * @width:         plane width in samples
 * @input_step:    distance in bytes between two adjacent input samples
 * @is_intra:      when set every block is coded as IBLOCK
 * @next_is_intra: when set the reference data is not updated
 *
 * Returns FRAME_PCODED if at least one block was inter coded, or
 * FRAME_UNENCODED (never together with FRAME_PCODED) if the plane was
 * stored as raw samples.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
                        struct cframe *cf, u32 height, u32 width,
                        unsigned int input_step,
                        bool is_intra, bool next_is_intra)
{
        u8 *input_start = input;
        __be16 *rlco_start = *rlco;
        s16 deltablock[64];
        __be16 pframe_bit = htons(PFRAME_BIT);
        u32 encoding = 0;
        /* size in words of the previous block, 0 before the first block */
        unsigned int last_size = 0;
        unsigned int i, j;

        for (j = 0; j < height / 8; j++) {
                for (i = 0; i < width / 8; i++) {
                        /* intra code, first frame is always intra coded. */
                        int blocktype = IBLOCK;
                        unsigned int size;

                        if (!is_intra)
                                blocktype = decide_blocktype(input, refp,
                                        deltablock, width, input_step);
                        if (blocktype == IBLOCK) {
                                fwht(input, cf->coeffs, width, input_step, 1);
                                quantize_intra(cf->coeffs, cf->de_coeffs,
                                               cf->i_frame_qp);
                        } else {
                                /* inter code */
                                encoding |= FRAME_PCODED;
                                fwht16(deltablock, cf->coeffs, 8, 0);
                                quantize_inter(cf->coeffs, cf->de_coeffs,
                                               cf->p_frame_qp);
                        }
                        if (!next_is_intra) {
                                /*
                                 * Reconstruct the block from the dequantized
                                 * coefficients and store it as the reference
                                 * for the next frame.
                                 */
                                ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

                                if (blocktype == PBLOCK)
                                        add_deltas(cf->de_fwht, refp, 8);
                                fill_decoder_block(refp, cf->de_fwht, 8);
                        }

                        input += 8 * input_step;
                        refp += 8 * 8;

                        size = rlc(cf->coeffs, *rlco, blocktype);
                        /*
                         * If the payload (everything but the header word)
                         * equals that of the previous block, try to merge
                         * this block into the previous one.
                         */
                        if (last_size == size &&
                            !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
                                __be16 *last_rlco = *rlco - size;
                                s16 hdr = ntohs(*last_rlco);

                                /*
                                 * Merge only if both blocks have the same
                                 * PFRAME_BIT and the duplicate counter in the
                                 * previous header is not saturated. The
                                 * counter starts at bit 1 (decode_plane()
                                 * reads it as (stat & DUPS_MASK) >> 1), so
                                 * adding 2 increments it by one. The write
                                 * position is then left unchanged, so the
                                 * next block overwrites this one.
                                 */
                                if (!((*last_rlco ^ **rlco) & pframe_bit) &&
                                    (hdr & DUPS_MASK) < DUPS_MASK)
                                        *last_rlco = htons(hdr + 2);
                                else
                                        *rlco += size;
                        } else {
                                *rlco += size;
                        }
                        /* out of room: give up and store the plane raw */
                        if (*rlco >= rlco_max) {
                                encoding |= FRAME_UNENCODED;
                                goto exit_loop;
                        }
                        last_size = size;
                }
                /* skip the remaining 7 rows of the block row just encoded */
                input += width * 7 * input_step;
        }

exit_loop:
        if (encoding & FRAME_UNENCODED) {
                /* restart at the beginning of this plane's output */
                u8 *out = (u8 *)rlco_start;

                input = input_start;
                /*
                 * The compressed stream should never contain the magic
                 * header, so when we copy the YUV data we replace 0xff
                 * by 0xfe. Since YUV is limited range such values
                 * shouldn't appear anyway.
                 *
                 * NOTE(review): refp is not rewritten on this path, so the
                 * reference data keeps whatever the loop above stored
                 * before it bailed out. Confirm that this cannot get out
                 * of step with the decoder's reference.
                 */
                for (i = 0; i < height * width; i++, input += input_step)
                        *out++ = (*input == 0xff) ? 0xfe : *input;
                *rlco = (__be16 *)out;
                encoding &= ~FRAME_PCODED;
        }
        return encoding;
}
729
730 u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
731                  struct cframe *cf, bool is_intra, bool next_is_intra)
732 {
733         unsigned int size = frm->height * frm->width;
734         __be16 *rlco = cf->rlc_data;
735         __be16 *rlco_max;
736         u32 encoding;
737         u32 chroma_h = frm->height / frm->height_div;
738         u32 chroma_w = frm->width / frm->width_div;
739         unsigned int chroma_size = chroma_h * chroma_w;
740
741         rlco_max = rlco + size / 2 - 256;
742         encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
743                                 frm->height, frm->width,
744                                 frm->luma_step, is_intra, next_is_intra);
745         if (encoding & FRAME_UNENCODED)
746                 encoding |= LUMA_UNENCODED;
747         encoding &= ~FRAME_UNENCODED;
748         rlco_max = rlco + chroma_size / 2 - 256;
749         encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
750                                  chroma_h, chroma_w,
751                                  frm->chroma_step, is_intra, next_is_intra);
752         if (encoding & FRAME_UNENCODED)
753                 encoding |= CB_UNENCODED;
754         encoding &= ~FRAME_UNENCODED;
755         rlco_max = rlco + chroma_size / 2 - 256;
756         encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
757                                  chroma_h, chroma_w,
758                                  frm->chroma_step, is_intra, next_is_intra);
759         if (encoding & FRAME_UNENCODED)
760                 encoding |= CR_UNENCODED;
761         encoding &= ~FRAME_UNENCODED;
762         cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
763         return encoding;
764 }
765
766 static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
767                          u32 height, u32 width, bool uncompressed)
768 {
769         unsigned int copies = 0;
770         s16 copy[8 * 8];
771         s16 stat;
772         unsigned int i, j;
773
774         if (uncompressed) {
775                 memcpy(ref, *rlco, width * height);
776                 *rlco += width * height / 2;
777                 return;
778         }
779
780         /*
781          * When decoding each macroblock the rlco pointer will be increased
782          * by 65 * 2 bytes worst-case.
783          * To avoid overflow the buffer has to be 65/64th of the actual raw
784          * image size, just in case someone feeds it malicious data.
785          */
786         for (j = 0; j < height / 8; j++) {
787                 for (i = 0; i < width / 8; i++) {
788                         u8 *refp = ref + j * 8 * width + i * 8;
789
790                         if (copies) {
791                                 memcpy(cf->de_fwht, copy, sizeof(copy));
792                                 if (stat & PFRAME_BIT)
793                                         add_deltas(cf->de_fwht, refp, width);
794                                 fill_decoder_block(refp, cf->de_fwht, width);
795                                 copies--;
796                                 continue;
797                         }
798
799                         stat = derlc(rlco, cf->coeffs);
800
801                         if (stat & PFRAME_BIT)
802                                 dequantize_inter(cf->coeffs);
803                         else
804                                 dequantize_intra(cf->coeffs);
805
806                         ifwht(cf->coeffs, cf->de_fwht,
807                               (stat & PFRAME_BIT) ? 0 : 1);
808
809                         copies = (stat & DUPS_MASK) >> 1;
810                         if (copies)
811                                 memcpy(copy, cf->de_fwht, sizeof(copy));
812                         if (stat & PFRAME_BIT)
813                                 add_deltas(cf->de_fwht, refp, width);
814                         fill_decoder_block(refp, cf->de_fwht, width);
815                 }
816         }
817 }
818
819 void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
820 {
821         const __be16 *rlco = cf->rlc_data;
822         u32 h = cf->height / 2;
823         u32 w = cf->width / 2;
824
825         if (hdr_flags & VICODEC_FL_CHROMA_FULL_HEIGHT)
826                 h *= 2;
827         if (hdr_flags & VICODEC_FL_CHROMA_FULL_WIDTH)
828                 w *= 2;
829         decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
830                      hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
831         decode_plane(cf, &rlco, ref->cb, h, w,
832                      hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
833         decode_plane(cf, &rlco, ref->cr, h, w,
834                      hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
835 }