Merge tag 'nfs-for-4.19-1' of git://git.linux-nfs.org/projects/anna/linux-nfs
[linux-block.git] / drivers / media / platform / vicodec / vicodec-codec.c
1 // SPDX-License-Identifier: GPL-2.0+
2 /*
3  * Copyright 2016 Tom aan de Wiel
4  * Copyright 2018 Cisco Systems, Inc. and/or its affiliates. All rights reserved.
5  *
6  * 8x8 Fast Walsh Hadamard Transform in sequency order based on the paper:
7  *
8  * A Recursive Algorithm for Sequency-Ordered Fast Walsh Transforms,
9  * R.D. Brown, 1977
10  */
11
12 #include <linux/string.h>
13 #include "vicodec-codec.h"
14
15 #define ALL_ZEROS 15
16 #define DEADZONE_WIDTH 20
17
/*
 * Zigzag scan order for an 8x8 block: entry i is the raster-order index
 * of the i-th coefficient when walking the block along anti-diagonals.
 * This orders coefficients roughly from low to high frequency so that
 * trailing zeros cluster at the end for the run-length coder.
 */
static const uint8_t zigzag[64] = {
	0,
	1,  8,
	2,  9, 16,
	3, 10, 17, 24,
	4, 11, 18, 25, 32,
	5, 12, 19, 26, 33, 40,
	6, 13, 20, 27, 34, 41, 48,
	7, 14, 21, 28, 35, 42, 49, 56,
	15, 22, 29, 36, 43, 50, 57,
	23, 30, 37, 44, 51, 58,
	31, 38, 45, 52, 59,
	39, 46, 53, 60,
	47, 54, 61,
	55, 62,
	63,
};
35
36
/*
 * Run-length encode one quantized 8x8 block of coefficients.
 *
 * The block is scanned in zigzag order. The first emitted word is the
 * block header: PFRAME_BIT for P-coded blocks, 0 for I-coded blocks
 * (encode_plane() may later fold a duplicate-block count into this
 * header). Each following word packs a zero-run length in its low
 * 4 bits (at most 14) and the coefficient ending the run in the upper
 * 12 bits. A trailing zero run longer than 14 is replaced by a single
 * ALL_ZEROS marker word meaning "rest of block is zero".
 *
 * Returns the number of 16-bit words written to @output.
 */
static int rlc(const s16 *in, __be16 *output, int blocktype)
{
	s16 block[8 * 8];
	s16 *wp = block;
	int i = 0;
	int x, y;
	int ret = 0;

	/* read in block from framebuffer */
	int lastzero_run = 0;
	int to_encode;

	for (y = 0; y < 8; y++) {
		for (x = 0; x < 8; x++) {
			*wp = in[x + y * 8];
			wp++;
		}
	}

	/* keep track of amount of trailing zeros */
	for (i = 63; i >= 0 && !block[zigzag[i]]; i--)
		lastzero_run++;

	/* block header: just the PFRAME bit, duplicate count still zero */
	*output++ = (blocktype == PBLOCK ? htons(PFRAME_BIT) : 0);
	ret++;

	/* a long trailing zero run is handled by the ALL_ZEROS marker */
	to_encode = 8 * 8 - (lastzero_run > 14 ? lastzero_run : 0);

	i = 0;
	while (i < to_encode) {
		int cnt = 0;
		int tmp;

		/* count leading zeros */
		while ((tmp = block[zigzag[i]]) == 0 && cnt < 14) {
			cnt++;
			i++;
			if (i == to_encode) {
				/*
				 * The run hit the end of the data to
				 * encode: back up one so the final zero
				 * becomes the coefficient of this word.
				 */
				cnt--;
				break;
			}
		}
		/* 4 bits for run, 12 for coefficient (quantization by 4) */
		*output++ = htons((cnt | tmp << 4));
		i++;
		ret++;
	}
	if (lastzero_run > 14) {
		/* everything from here to the end of the block is zero */
		*output = htons(ALL_ZEROS | 0);
		ret++;
	}

	return ret;
}
91
/*
 * Run-length decode one 8x8 block from *@rlc_in into @dwht_out,
 * undoing the zigzag scan so @dwht_out ends up in raster order.
 *
 * This function will worst-case increase rlc_in by 65*2 bytes:
 * one s16 value for the header and 8 * 8 coefficients of type s16.
 *
 * *@rlc_in is advanced past the consumed words. Returns the block
 * header word (PFRAME_BIT plus the duplicate count).
 */
static s16 derlc(const __be16 **rlc_in, s16 *dwht_out)
{
	/* header */
	const __be16 *input = *rlc_in;
	s16 ret = ntohs(*input++);
	int dec_count = 0;
	s16 block[8 * 8 + 16];
	s16 *wp = block;
	int i;

	/*
	 * Now de-compress, it expands one byte to up to 15 bytes
	 * (or fills the remainder of the 64 bytes with zeroes if it
	 * is the last byte to expand).
	 *
	 * So block has to be 8 * 8 + 16 bytes, the '+ 16' is to
	 * allow for overflow if the incoming data was malformed.
	 */
	while (dec_count < 8 * 8) {
		s16 in = ntohs(*input++);
		int length = in & 0xf;	/* zero-run length, 15 = ALL_ZEROS */
		int coeff = in >> 4;	/* signed 12-bit coefficient */

		/* fill remainder with zeros */
		if (length == 15) {
			for (i = 0; i < 64 - dec_count; i++)
				*wp++ = 0;
			break;
		}

		for (i = 0; i < length; i++)
			*wp++ = 0;
		*wp++ = coeff;
		dec_count += length + 1;
	}

	wp = block;

	/* undo the zigzag scan: write coefficients back in raster order */
	for (i = 0; i < 64; i++) {
		int pos = zigzag[i];
		int y = pos / 8;
		int x = pos % 8;

		dwht_out[x + y * 8] = *wp++;
	}
	*rlc_in = input;
	return ret;
}
144
/*
 * Per-coefficient quantization shift amounts for intra blocks, in
 * raster order: higher-frequency coefficients (bottom-right) are
 * quantized more coarsely than low-frequency ones.
 */
static const int quant_table[] = {
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  2,
	2, 2, 2, 2, 2, 2,  2,  3,
	2, 2, 2, 2, 2, 2,  3,  6,
	2, 2, 2, 2, 2, 3,  6,  6,
	2, 2, 2, 2, 3, 6,  6,  6,
	2, 2, 2, 3, 6, 6,  6,  6,
	2, 2, 3, 6, 6, 6,  6,  8,
};
155
/*
 * Per-coefficient quantization shift amounts for inter (P) blocks.
 * Shifts are larger than in quant_table since P-block deltas span
 * twice the range of intra samples.
 */
static const int quant_table_p[] = {
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  3,
	3, 3, 3, 3, 3, 3,  3,  6,
	3, 3, 3, 3, 3, 3,  6,  6,
	3, 3, 3, 3, 3, 6,  6,  9,
	3, 3, 3, 3, 6, 6,  9,  9,
	3, 3, 3, 6, 6, 9,  9,  10,
};
166
167 static void quantize_intra(s16 *coeff, s16 *de_coeff)
168 {
169         const int *quant = quant_table;
170         int i, j;
171
172         for (j = 0; j < 8; j++) {
173                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
174                         *coeff >>= *quant;
175                         if (*coeff >= -DEADZONE_WIDTH &&
176                             *coeff <= DEADZONE_WIDTH)
177                                 *coeff = *de_coeff = 0;
178                         else
179                                 *de_coeff = *coeff << *quant;
180                 }
181         }
182 }
183
184 static void dequantize_intra(s16 *coeff)
185 {
186         const int *quant = quant_table;
187         int i, j;
188
189         for (j = 0; j < 8; j++)
190                 for (i = 0; i < 8; i++, quant++, coeff++)
191                         *coeff <<= *quant;
192 }
193
194 static void quantize_inter(s16 *coeff, s16 *de_coeff)
195 {
196         const int *quant = quant_table_p;
197         int i, j;
198
199         for (j = 0; j < 8; j++) {
200                 for (i = 0; i < 8; i++, quant++, coeff++, de_coeff++) {
201                         *coeff >>= *quant;
202                         if (*coeff >= -DEADZONE_WIDTH &&
203                             *coeff <= DEADZONE_WIDTH)
204                                 *coeff = *de_coeff = 0;
205                         else
206                                 *de_coeff = *coeff << *quant;
207                 }
208         }
209 }
210
211 static void dequantize_inter(s16 *coeff)
212 {
213         const int *quant = quant_table_p;
214         int i, j;
215
216         for (j = 0; j < 8; j++)
217                 for (i = 0; i < 8; i++, quant++, coeff++)
218                         *coeff <<= *quant;
219 }
220
/*
 * Forward 8x8 Walsh-Hadamard transform of a block of u8 samples.
 *
 * @block: top-left sample of the 8x8 input block
 * @output_block: receives the 64 s16 coefficients, row-major
 * @stride: distance between lines, in samples
 * @input_step: distance between horizontally adjacent samples; the
 *	hard-coded indices below only support step 1 (planar) and
 *	step 2 (interleaved chroma)
 * @intra: if true, 256 is subtracted from every pairwise sum, which is
 *	equivalent to centering each input sample around 128
 *
 * Rows are transformed first, then columns, each in three butterfly
 * stages.
 */
static void fwht(const u8 *block, s16 *output_block, unsigned int stride,
		 unsigned int input_step, bool intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const u8 *tmp = block;
	s16 *out = output_block;
	int add = intra ? 256 : 0;
	unsigned int i;

	/* stage 1 */
	stride *= input_step;

	/* horizontal pass: one row of output per input line */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		if (input_step == 1) {
			workspace1[0]  = tmp[0] + tmp[1] - add;
			workspace1[1]  = tmp[0] - tmp[1];

			workspace1[2]  = tmp[2] + tmp[3] - add;
			workspace1[3]  = tmp[2] - tmp[3];

			workspace1[4]  = tmp[4] + tmp[5] - add;
			workspace1[5]  = tmp[4] - tmp[5];

			workspace1[6]  = tmp[6] + tmp[7] - add;
			workspace1[7]  = tmp[6] - tmp[7];
		} else {
			/* input_step == 2: every other sample */
			workspace1[0]  = tmp[0] + tmp[2] - add;
			workspace1[1]  = tmp[0] - tmp[2];

			workspace1[2]  = tmp[4] + tmp[6] - add;
			workspace1[3]  = tmp[4] - tmp[6];

			workspace1[4]  = tmp[8] + tmp[10] - add;
			workspace1[5]  = tmp[8] - tmp[10];

			workspace1[6]  = tmp[12] + tmp[14] - add;
			workspace1[7]  = tmp[12] - tmp[14];
		}

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass over the intermediate coefficients, in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1 * 8];
		workspace1[1]  = out[0] - out[1 * 8];

		workspace1[2]  = out[2 * 8] + out[3 * 8];
		workspace1[3]  = out[2 * 8] - out[3 * 8];

		workspace1[4]  = out[4 * 8] + out[5 * 8];
		workspace1[5]  = out[4 * 8] - out[5 * 8];

		workspace1[6]  = out[6 * 8] + out[7 * 8];
		workspace1[7]  = out[6 * 8] - out[7 * 8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];
		/* stage 3 */
		out[0 * 8] = workspace2[0] + workspace2[4];
		out[1 * 8] = workspace2[0] - workspace2[4];
		out[2 * 8] = workspace2[1] - workspace2[5];
		out[3 * 8] = workspace2[1] + workspace2[5];
		out[4 * 8] = workspace2[2] + workspace2[6];
		out[5 * 8] = workspace2[2] - workspace2[6];
		out[6 * 8] = workspace2[3] - workspace2[7];
		out[7 * 8] = workspace2[3] + workspace2[7];
	}
}
320
/*
 * Not the nicest way of doing it, but P-blocks get twice the range of
 * that of the I-blocks. Therefore we need a type bigger than 8 bits.
 * Furthermore values can be negative... This is just a version that
 * works with 16 signed data
 *
 * Same row-then-column three-stage butterfly as fwht(), but reading
 * contiguous s16 samples (@stride is in s16 units). NOTE(review): the
 * @intra argument is not referenced anywhere in the body — presumably
 * kept for signature symmetry with fwht(); confirm before removing.
 */
static void fwht16(const s16 *block, s16 *output_block, int stride, int intra)
{
	/* we'll need more than 8 bits for the transformed coefficients */
	s32 workspace1[8], workspace2[8];
	const s16 *tmp = block;
	s16 *out = output_block;
	int i;

	/* horizontal pass */
	for (i = 0; i < 8; i++, tmp += stride, out += 8) {
		/* stage 1 */
		workspace1[0]  = tmp[0] + tmp[1];
		workspace1[1]  = tmp[0] - tmp[1];

		workspace1[2]  = tmp[2] + tmp[3];
		workspace1[3]  = tmp[2] - tmp[3];

		workspace1[4]  = tmp[4] + tmp[5];
		workspace1[5]  = tmp[4] - tmp[5];

		workspace1[6]  = tmp[6] + tmp[7];
		workspace1[7]  = tmp[6] - tmp[7];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0] = workspace2[0] + workspace2[4];
		out[1] = workspace2[0] - workspace2[4];
		out[2] = workspace2[1] - workspace2[5];
		out[3] = workspace2[1] + workspace2[5];
		out[4] = workspace2[2] + workspace2[6];
		out[5] = workspace2[2] - workspace2[6];
		out[6] = workspace2[3] - workspace2[7];
		out[7] = workspace2[3] + workspace2[7];
	}

	out = output_block;

	/* vertical pass, in place */
	for (i = 0; i < 8; i++, out++) {
		/* stage 1 */
		workspace1[0]  = out[0] + out[1*8];
		workspace1[1]  = out[0] - out[1*8];

		workspace1[2]  = out[2*8] + out[3*8];
		workspace1[3]  = out[2*8] - out[3*8];

		workspace1[4]  = out[4*8] + out[5*8];
		workspace1[5]  = out[4*8] - out[5*8];

		workspace1[6]  = out[6*8] + out[7*8];
		workspace1[7]  = out[6*8] - out[7*8];

		/* stage 2 */
		workspace2[0] = workspace1[0] + workspace1[2];
		workspace2[1] = workspace1[0] - workspace1[2];
		workspace2[2] = workspace1[1] - workspace1[3];
		workspace2[3] = workspace1[1] + workspace1[3];

		workspace2[4] = workspace1[4] + workspace1[6];
		workspace2[5] = workspace1[4] - workspace1[6];
		workspace2[6] = workspace1[5] - workspace1[7];
		workspace2[7] = workspace1[5] + workspace1[7];

		/* stage 3 */
		out[0*8] = workspace2[0] + workspace2[4];
		out[1*8] = workspace2[0] - workspace2[4];
		out[2*8] = workspace2[1] - workspace2[5];
		out[3*8] = workspace2[1] + workspace2[5];
		out[4*8] = workspace2[2] + workspace2[6];
		out[5*8] = workspace2[2] - workspace2[6];
		out[6*8] = workspace2[3] - workspace2[7];
		out[7*8] = workspace2[3] + workspace2[7];
	}
}
409
410 static void ifwht(const s16 *block, s16 *output_block, int intra)
411 {
412         /*
413          * we'll need more than 8 bits for the transformed coefficients
414          * use native unit of cpu
415          */
416         int workspace1[8], workspace2[8];
417         int inter = intra ? 0 : 1;
418         const s16 *tmp = block;
419         s16 *out = output_block;
420         int i;
421
422         for (i = 0; i < 8; i++, tmp += 8, out += 8) {
423                 /* stage 1 */
424                 workspace1[0]  = tmp[0] + tmp[1];
425                 workspace1[1]  = tmp[0] - tmp[1];
426
427                 workspace1[2]  = tmp[2] + tmp[3];
428                 workspace1[3]  = tmp[2] - tmp[3];
429
430                 workspace1[4]  = tmp[4] + tmp[5];
431                 workspace1[5]  = tmp[4] - tmp[5];
432
433                 workspace1[6]  = tmp[6] + tmp[7];
434                 workspace1[7]  = tmp[6] - tmp[7];
435
436                 /* stage 2 */
437                 workspace2[0] = workspace1[0] + workspace1[2];
438                 workspace2[1] = workspace1[0] - workspace1[2];
439                 workspace2[2] = workspace1[1] - workspace1[3];
440                 workspace2[3] = workspace1[1] + workspace1[3];
441
442                 workspace2[4] = workspace1[4] + workspace1[6];
443                 workspace2[5] = workspace1[4] - workspace1[6];
444                 workspace2[6] = workspace1[5] - workspace1[7];
445                 workspace2[7] = workspace1[5] + workspace1[7];
446
447                 /* stage 3 */
448                 out[0] = workspace2[0] + workspace2[4];
449                 out[1] = workspace2[0] - workspace2[4];
450                 out[2] = workspace2[1] - workspace2[5];
451                 out[3] = workspace2[1] + workspace2[5];
452                 out[4] = workspace2[2] + workspace2[6];
453                 out[5] = workspace2[2] - workspace2[6];
454                 out[6] = workspace2[3] - workspace2[7];
455                 out[7] = workspace2[3] + workspace2[7];
456         }
457
458         out = output_block;
459
460         for (i = 0; i < 8; i++, out++) {
461                 /* stage 1 */
462                 workspace1[0]  = out[0] + out[1 * 8];
463                 workspace1[1]  = out[0] - out[1 * 8];
464
465                 workspace1[2]  = out[2 * 8] + out[3 * 8];
466                 workspace1[3]  = out[2 * 8] - out[3 * 8];
467
468                 workspace1[4]  = out[4 * 8] + out[5 * 8];
469                 workspace1[5]  = out[4 * 8] - out[5 * 8];
470
471                 workspace1[6]  = out[6 * 8] + out[7 * 8];
472                 workspace1[7]  = out[6 * 8] - out[7 * 8];
473
474                 /* stage 2 */
475                 workspace2[0] = workspace1[0] + workspace1[2];
476                 workspace2[1] = workspace1[0] - workspace1[2];
477                 workspace2[2] = workspace1[1] - workspace1[3];
478                 workspace2[3] = workspace1[1] + workspace1[3];
479
480                 workspace2[4] = workspace1[4] + workspace1[6];
481                 workspace2[5] = workspace1[4] - workspace1[6];
482                 workspace2[6] = workspace1[5] - workspace1[7];
483                 workspace2[7] = workspace1[5] + workspace1[7];
484
485                 /* stage 3 */
486                 if (inter) {
487                         int d;
488
489                         out[0 * 8] = workspace2[0] + workspace2[4];
490                         out[1 * 8] = workspace2[0] - workspace2[4];
491                         out[2 * 8] = workspace2[1] - workspace2[5];
492                         out[3 * 8] = workspace2[1] + workspace2[5];
493                         out[4 * 8] = workspace2[2] + workspace2[6];
494                         out[5 * 8] = workspace2[2] - workspace2[6];
495                         out[6 * 8] = workspace2[3] - workspace2[7];
496                         out[7 * 8] = workspace2[3] + workspace2[7];
497
498                         for (d = 0; d < 8; d++)
499                                 out[8 * d] >>= 6;
500                 } else {
501                         int d;
502
503                         out[0 * 8] = workspace2[0] + workspace2[4];
504                         out[1 * 8] = workspace2[0] - workspace2[4];
505                         out[2 * 8] = workspace2[1] - workspace2[5];
506                         out[3 * 8] = workspace2[1] + workspace2[5];
507                         out[4 * 8] = workspace2[2] + workspace2[6];
508                         out[5 * 8] = workspace2[2] - workspace2[6];
509                         out[6 * 8] = workspace2[3] - workspace2[7];
510                         out[7 * 8] = workspace2[3] + workspace2[7];
511
512                         for (d = 0; d < 8; d++) {
513                                 out[8 * d] >>= 6;
514                                 out[8 * d] += 128;
515                         }
516                 }
517         }
518 }
519
520 static void fill_encoder_block(const u8 *input, s16 *dst,
521                                unsigned int stride, unsigned int input_step)
522 {
523         int i, j;
524
525         for (i = 0; i < 8; i++) {
526                 for (j = 0; j < 8; j++, input += input_step)
527                         *dst++ = *input;
528                 input += (stride - 8) * input_step;
529         }
530 }
531
532 static int var_intra(const s16 *input)
533 {
534         int32_t mean = 0;
535         int32_t ret = 0;
536         const s16 *tmp = input;
537         int i;
538
539         for (i = 0; i < 8 * 8; i++, tmp++)
540                 mean += *tmp;
541         mean /= 64;
542         tmp = input;
543         for (i = 0; i < 8 * 8; i++, tmp++)
544                 ret += (*tmp - mean) < 0 ? -(*tmp - mean) : (*tmp - mean);
545         return ret;
546 }
547
548 static int var_inter(const s16 *old, const s16 *new)
549 {
550         int32_t ret = 0;
551         int i;
552
553         for (i = 0; i < 8 * 8; i++, old++, new++)
554                 ret += (*old - *new) < 0 ? -(*old - *new) : (*old - *new);
555         return ret;
556 }
557
558 static int decide_blocktype(const u8 *cur, const u8 *reference,
559                             s16 *deltablock, unsigned int stride,
560                             unsigned int input_step)
561 {
562         s16 tmp[64];
563         s16 old[64];
564         s16 *work = tmp;
565         unsigned int k, l;
566         int vari;
567         int vard;
568
569         fill_encoder_block(cur, tmp, stride, input_step);
570         fill_encoder_block(reference, old, 8, 1);
571         vari = var_intra(tmp);
572
573         for (k = 0; k < 8; k++) {
574                 for (l = 0; l < 8; l++) {
575                         *deltablock = *work - *reference;
576                         deltablock++;
577                         work++;
578                         reference++;
579                 }
580         }
581         deltablock -= 64;
582         vard = var_inter(old, tmp);
583         return vari <= vard ? IBLOCK : PBLOCK;
584 }
585
586 static void fill_decoder_block(u8 *dst, const s16 *input, int stride)
587 {
588         int i, j;
589
590         for (i = 0; i < 8; i++) {
591                 for (j = 0; j < 8; j++)
592                         *dst++ = *input++;
593                 dst += stride - 8;
594         }
595 }
596
597 static void add_deltas(s16 *deltas, const u8 *ref, int stride)
598 {
599         int k, l;
600
601         for (k = 0; k < 8; k++) {
602                 for (l = 0; l < 8; l++) {
603                         *deltas += *ref++;
604                         /*
605                          * Due to quantizing, it might possible that the
606                          * decoded coefficients are slightly out of range
607                          */
608                         if (*deltas < 0)
609                                 *deltas = 0;
610                         else if (*deltas > 255)
611                                 *deltas = 255;
612                         deltas++;
613                 }
614                 ref += stride - 8;
615         }
616 }
617
/*
 * Compress one plane into the run-length coded stream at *@rlco,
 * updating the reference plane @refp along the way.
 *
 * @input/@refp: current and reference plane data
 * @rlco: in/out pointer into the compressed stream
 * @rlco_max: overflow limit for the compressed stream
 * @cf: scratch buffers for the transform stages
 * @height/@width: plane dimensions; @input_step: horizontal sample step
 * @is_intra: force all blocks to be intra coded
 * @next_is_intra: if set, the decoded reference is not needed and its
 *	reconstruction is skipped
 *
 * Returns FRAME_PCODED if any block was P-coded, and FRAME_UNENCODED if
 * the compressed data would overflow @rlco_max — in that case the raw
 * plane (with 0xff bytes replaced by 0xfe) is stored instead.
 */
static u32 encode_plane(u8 *input, u8 *refp, __be16 **rlco, __be16 *rlco_max,
			struct cframe *cf, u32 height, u32 width,
			unsigned int input_step,
			bool is_intra, bool next_is_intra)
{
	u8 *input_start = input;
	__be16 *rlco_start = *rlco;
	s16 deltablock[64];
	__be16 pframe_bit = htons(PFRAME_BIT);
	u32 encoding = 0;
	unsigned int last_size = 0;
	unsigned int i, j;

	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			/* intra code, first frame is always intra coded. */
			int blocktype = IBLOCK;
			unsigned int size;

			if (!is_intra)
				blocktype = decide_blocktype(input, refp,
					deltablock, width, input_step);
			if (is_intra || blocktype == IBLOCK) {
				fwht(input, cf->coeffs, width, input_step, 1);
				quantize_intra(cf->coeffs, cf->de_coeffs);
				blocktype = IBLOCK;
			} else {
				/* inter code */
				encoding |= FRAME_PCODED;
				fwht16(deltablock, cf->coeffs, 8, 0);
				quantize_inter(cf->coeffs, cf->de_coeffs);
			}
			if (!next_is_intra) {
				/* reconstruct the block the decoder will see */
				ifwht(cf->de_coeffs, cf->de_fwht, blocktype);

				if (blocktype == PBLOCK)
					add_deltas(cf->de_fwht, refp, 8);
				fill_decoder_block(refp, cf->de_fwht, 8);
			}

			input += 8 * input_step;
			refp += 8 * 8;

			/* once overflowed, only reference updates matter */
			if (encoding & FRAME_UNENCODED)
				continue;

			size = rlc(cf->coeffs, *rlco, blocktype);
			/*
			 * If this block compressed to exactly the same
			 * payload as the previous one, fold it into the
			 * previous block by bumping the duplicate count
			 * in that block's header instead of emitting it.
			 */
			if (last_size == size &&
			    !memcmp(*rlco + 1, *rlco - size + 1, 2 * size - 2)) {
				__be16 *last_rlco = *rlco - size;
				s16 hdr = ntohs(*last_rlco);

				if (!((*last_rlco ^ **rlco) & pframe_bit) &&
				    (hdr & DUPS_MASK) < DUPS_MASK)
					*last_rlco = htons(hdr + 2);
				else
					*rlco += size;
			} else {
				*rlco += size;
			}
			if (*rlco >= rlco_max)
				encoding |= FRAME_UNENCODED;
			last_size = size;
		}
		/* advance past the remaining 7 lines of this block row */
		input += width * 7 * input_step;
	}
	if (encoding & FRAME_UNENCODED) {
		u8 *out = (u8 *)rlco_start;

		input = input_start;
		/*
		 * The compressed stream should never contain the magic
		 * header, so when we copy the YUV data we replace 0xff
		 * by 0xfe. Since YUV is limited range such values
		 * shouldn't appear anyway.
		 */
		for (i = 0; i < height * width; i++, input += input_step)
			*out++ = (*input == 0xff) ? 0xfe : *input;
		*rlco = (__be16 *)out;
	}
	return encoding;
}
700
/*
 * Compress a complete raw frame (luma, then Cb, then Cr) into
 * cf->rlc_data and set cf->size accordingly.
 *
 * Each plane's FRAME_UNENCODED result is converted into its
 * plane-specific flag (LUMA/CB/CR_UNENCODED) so the caller can tell
 * which planes were stored raw. Returns the combined encoding flags.
 */
u32 encode_frame(struct raw_frame *frm, struct raw_frame *ref_frm,
		 struct cframe *cf, bool is_intra, bool next_is_intra)
{
	unsigned int size = frm->height * frm->width;
	__be16 *rlco = cf->rlc_data;
	__be16 *rlco_max;
	u32 encoding;

	/* leave 256 words of headroom before declaring overflow */
	rlco_max = rlco + size / 2 - 256;
	encoding = encode_plane(frm->luma, ref_frm->luma, &rlco, rlco_max, cf,
				  frm->height, frm->width,
				  1, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= LUMA_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	/* chroma planes are a quarter of the luma size */
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cb, ref_frm->cb, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CB_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	rlco_max = rlco + size / 8 - 256;
	encoding |= encode_plane(frm->cr, ref_frm->cr, &rlco, rlco_max, cf,
				   frm->height / 2, frm->width / 2,
				   frm->chroma_step, is_intra, next_is_intra);
	if (encoding & FRAME_UNENCODED)
		encoding |= CR_UNENCODED;
	encoding &= ~FRAME_UNENCODED;
	cf->size = (rlco - cf->rlc_data) * sizeof(*rlco);
	return encoding;
}
733
/*
 * Decode one plane from the compressed stream at *@rlco into @ref.
 *
 * If @uncompressed, the plane was stored raw and is simply copied.
 * Otherwise each 8x8 block is run-length decoded, dequantized and
 * inverse transformed; P-coded blocks additionally add the decoded
 * deltas onto the existing reference data. A non-zero duplicate count
 * in a block header replays the same decoded block for the following
 * macroblocks.
 */
static void decode_plane(struct cframe *cf, const __be16 **rlco, u8 *ref,
			 u32 height, u32 width, bool uncompressed)
{
	unsigned int copies = 0;
	s16 copy[8 * 8];
	s16 stat;
	unsigned int i, j;

	if (uncompressed) {
		memcpy(ref, *rlco, width * height);
		*rlco += width * height / 2;
		return;
	}

	/*
	 * When decoding each macroblock the rlco pointer will be increased
	 * by 65 * 2 bytes worst-case.
	 * To avoid overflow the buffer has to be 65/64th of the actual raw
	 * image size, just in case someone feeds it malicious data.
	 */
	for (j = 0; j < height / 8; j++) {
		for (i = 0; i < width / 8; i++) {
			u8 *refp = ref + j * 8 * width + i * 8;

			/*
			 * Replay a duplicated block; 'stat' is always
			 * valid here since copies > 0 implies an earlier
			 * derlc() call set it.
			 */
			if (copies) {
				memcpy(cf->de_fwht, copy, sizeof(copy));
				if (stat & PFRAME_BIT)
					add_deltas(cf->de_fwht, refp, width);
				fill_decoder_block(refp, cf->de_fwht, width);
				copies--;
				continue;
			}

			stat = derlc(rlco, cf->coeffs);

			if (stat & PFRAME_BIT)
				dequantize_inter(cf->coeffs);
			else
				dequantize_intra(cf->coeffs);

			ifwht(cf->coeffs, cf->de_fwht,
			      (stat & PFRAME_BIT) ? 0 : 1);

			/* duplicate count lives above the PFRAME bit */
			copies = (stat & DUPS_MASK) >> 1;
			if (copies)
				memcpy(copy, cf->de_fwht, sizeof(copy));
			if (stat & PFRAME_BIT)
				add_deltas(cf->de_fwht, refp, width);
			fill_decoder_block(refp, cf->de_fwht, width);
		}
	}
}
786
787 void decode_frame(struct cframe *cf, struct raw_frame *ref, u32 hdr_flags)
788 {
789         const __be16 *rlco = cf->rlc_data;
790
791         decode_plane(cf, &rlco, ref->luma, cf->height, cf->width,
792                      hdr_flags & VICODEC_FL_LUMA_IS_UNCOMPRESSED);
793         decode_plane(cf, &rlco, ref->cb, cf->height / 2, cf->width / 2,
794                      hdr_flags & VICODEC_FL_CB_IS_UNCOMPRESSED);
795         decode_plane(cf, &rlco, ref->cr, cf->height / 2, cf->width / 2,
796                      hdr_flags & VICODEC_FL_CR_IS_UNCOMPRESSED);
797 }