/* Source: linux-block.git, lib/crypto/gf128mul.c */

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * gf128mul_dat(q) expands to a brace-enclosed initializer list with 256
 * entries: q(0x00), q(0x01), ..., q(0xff), in ascending order.  It is used
 * below to build 256-entry lookup tables by passing a macro 'q' that maps
 * a byte value to the table entry for that byte.
 */
#define gf128mul_dat(q) { \
	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
}

/*
 * Given a value i in 0..255 as the byte overflow when a field element
 * in GF(2^128) is multiplied by x^8, the following macro returns the
 * 16-bit value that must be XOR-ed into the low-degree end of the
 * product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
 *
 * There are two versions of the macro, and hence two tables: one for
 * the "be" convention where the highest-order bit is the coefficient of
 * the highest-degree polynomial term, and one for the "le" convention
 * where the highest-order bit is the coefficient of the lowest-degree
 * polynomial term.  In both cases the values are stored in CPU byte
 * endianness such that the coefficients are ordered consistently across
 * bytes, i.e. in the "be" table bits 15..0 of the stored value
 * correspond to the coefficients of x^15..x^0, and in the "le" table
 * bits 15..0 correspond to the coefficients of x^0..x^15.
 *
 * Therefore, provided that the appropriate byte endianness conversions
 * are done by the multiplication functions (and these must be in place
 * anyway to support both little endian and big endian CPUs), the "be"
 * table can be used for multiplications of both "bbe" and "ble"
 * elements, and the "le" table can be used for multiplications of both
 * "lle" and "lbe" elements.
 */

/*
 * xda_be(i): reduction value for overflow byte 'i' in the "be" bit
 * convention.  Each set bit of 'i' contributes one 16-bit constant; the
 * contributions are XOR-ed together.
 *
 * The parameter is parenthesized so that expression arguments such as
 * 'x | y' are masked as a whole.  Note that 'i' is still evaluated several
 * times, so it must not have side effects.
 */
#define xda_be(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)

/*
 * xda_le(i): reduction value for overflow byte 'i' in the "le" bit
 * convention.  Each set bit of 'i' contributes one 16-bit constant; the
 * contributions are XOR-ed together.
 *
 * The parameter is parenthesized so that expression arguments such as
 * 'x | y' are masked as a whole.  Note that 'i' is still evaluated several
 * times, so it must not have side effects.
 */
#define xda_le(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)

/*
 * Precomputed reduction tables, indexed by the overflow byte: entry i of
 * gf128mul_table_le[] is xda_le(i) and entry i of gf128mul_table_be[] is
 * xda_be(i).
 */
static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);

/*
 * The following functions multiply a field element by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

/*
 * Multiply the element *x by x^8 in place ("lle" convention).  The byte
 * that falls off the low end of x->b indexes gf128mul_table_le[] to fetch
 * the reduction value, which is folded into the top 16 bits of x->a.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_le[lo & 0xff];

	/* Both halves were loaded above, so the store order is irrelevant. */
	x->a = cpu_to_be64((reduce << 48) ^ (hi >> 8));
	x->b = cpu_to_be64((hi << 56) | (lo >> 8));
}

/*
 * Time invariant variant of gf128mul_x8_lle(): multiplies *x by x^8 in
 * place, but derives the reduction value with the xda_le() expression
 * rather than indexing gf128mul_table_le[], so no table lookup is made.
 */
static void gf128mul_x8_lle_ti(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = xda_le(lo & 0xff); /* computed, not looked up */

	x->a = cpu_to_be64((reduce << 48) ^ (hi >> 8));
	x->b = cpu_to_be64((hi << 56) | (lo >> 8));
}

/*
 * Multiply the element *x by x^8 in place ("bbe" convention).  The top
 * byte of x->a is shifted out and indexes gf128mul_table_be[] to fetch the
 * reduction value, which is folded into the low bits of x->b.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[hi >> 56];

	/* Both halves were loaded above, so the store order is irrelevant. */
	x->b = cpu_to_be64(reduce ^ (lo << 8));
	x->a = cpu_to_be64((lo >> 56) | (hi << 8));
}

/*
 * Multiply the element *x by x^8 ("ble" convention) and store the product
 * in *r.  Both halves of *x are read before *r is written.  The reduction
 * value comes from gf128mul_table_be[], indexed by the top byte of x->a.
 */
void gf128mul_x8_ble(le128 *r, const le128 *x)
{
	const u64 hi = le64_to_cpu(x->a);
	const u64 lo = le64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_be[hi >> 56];

	r->b = cpu_to_le64(reduce ^ (lo << 8));
	r->a = cpu_to_le64((lo >> 56) | (hi << 8));
}
EXPORT_SYMBOL(gf128mul_x8_ble);

/*
 * gf128mul_lle - multiply *r by *b in place ("lle" convention)
 * @r: in: first factor; out: the product
 * @b: second factor, read one byte at a time from byte 15 down to byte 0
 *
 * *r is cleared before any byte of *b is read, so @b must not point at the
 * same element as @r.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	/*
	 * The p array should be aligned to twice the size of its element type,
	 * so that every even/odd pair is guaranteed to share a cacheline
	 * (assuming a cacheline size of 32 bytes or more, which is by far the
	 * most common). This ensures that each be128_xor() call in the loop
	 * takes the same amount of time regardless of the value of 'ch', which
	 * is derived from function parameter 'b', which is commonly used as a
	 * key, e.g., for GHASH. The odd array elements are all set to zero,
	 * making each be128_xor() a NOP if its associated bit in 'ch' is not
	 * set, and this is equivalent to calling be128_xor() conditionally.
	 * This approach aims to avoid leaking information about such keys
	 * through execution time variances.
	 *
	 * Unfortunately, __aligned(16) or higher does not work on x86 for
	 * variables on the stack so we need to perform the alignment by hand.
	 */
	be128 array[16 + 3] = {};
	be128 *p = PTR_ALIGN(&array[0], 2 * sizeof(be128));
	int i;

	/* Even slots: p[2k] = r * x^k for k = 0..7; odd slots stay zero. */
	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]);

	memset(r, 0, sizeof(*r));
	for (i = 0;;) {
		/* Read through a const pointer: 'b' is never written. */
		u8 ch = ((const u8 *)b)[15 - i];

		/* Bit set -> even slot (a power of r); clear -> odd slot (zero). */
		be128_xor(r, r, &p[ 0 + !(ch & 0x80)]);
		be128_xor(r, r, &p[ 2 + !(ch & 0x40)]);
		be128_xor(r, r, &p[ 4 + !(ch & 0x20)]);
		be128_xor(r, r, &p[ 6 + !(ch & 0x10)]);
		be128_xor(r, r, &p[ 8 + !(ch & 0x08)]);
		be128_xor(r, r, &p[10 + !(ch & 0x04)]);
		be128_xor(r, r, &p[12 + !(ch & 0x02)]);
		be128_xor(r, r, &p[14 + !(ch & 0x01)]);

		if (++i >= 16)
			break;

		gf128mul_x8_lle_ti(r); /* use the time invariant version */
	}
}
EXPORT_SYMBOL(gf128mul_lle);

/*
 * gf128mul_bbe - multiply *r by *b in place ("bbe" convention)
 * @r: in: first factor; out: the product
 * @b: second factor, read one byte at a time from byte 0 up to byte 15
 *
 * *r is cleared before any byte of *b is read, so @b must not point at the
 * same element as @r.  Unlike gf128mul_lle(), the XORs here are executed
 * conditionally on the bits of *b.
 */
void gf128mul_bbe(be128 *r, const be128 *b)
{
	/* Read through a const pointer: 'b' is never written. */
	const u8 *bytes = (const u8 *)b;
	be128 p[8];
	int i, bit;

	/* p[k] = r * x^k for k = 0..7 */
	p[0] = *r;
	for (i = 0; i < 7; ++i)
		gf128mul_x_bbe(&p[i + 1], &p[i]);

	memset(r, 0, sizeof(*r));
	for (i = 0; i < 16; ++i) {
		u8 ch = bytes[i];

		/* Shift the accumulator by one byte between input bytes. */
		if (i)
			gf128mul_x8_bbe(r);

		/* Bit k of the byte selects p[k]; highest bit first. */
		for (bit = 7; bit >= 0; --bit) {
			if (ch & (1u << bit))
				be128_xor(r, r, &p[bit]);
		}
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need a table for each of the 15 higher bytes in the
    buffer as well, which makes 16 tables and 64 kbytes in total.
*/
/*
 * gf128mul_init_64k_bbe - allocate and fill the 16 per-byte tables for g
 * @g: the element the tables are built for
 *
 * Table layout:
 *   t[0][BYTE]  contains g*BYTE
 *   t[1][BYTE]  contains g*x^8*BYTE
 *    ..
 *   t[15][BYTE] contains g*x^120*BYTE
 *
 * Returns the new table set, or NULL if any allocation fails (anything
 * already allocated is released through gf128mul_free_64k() first).
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *tbl;
	int idx, bit, lo;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (idx = 0; idx < 16; idx++) {
		tbl->t[idx] = kzalloc(sizeof(*tbl->t[idx]), GFP_KERNEL);
		if (!tbl->t[idx]) {
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	for (idx = 0; idx < 16; idx++) {
		be128 *cur = tbl->t[idx]->t;

		if (idx == 0) {
			/* Seed the single-bit entries 1, 2, 4, ..., 128 from g. */
			cur[1] = *g;
			for (bit = 1; bit <= 64; bit <<= 1)
				gf128mul_x_bbe(&cur[2 * bit], &cur[bit]);
		} else {
			/* Single-bit entries are the previous table's times x^8. */
			be128 *prev = tbl->t[idx - 1]->t;

			for (bit = 128; bit > 0; bit >>= 1) {
				cur[bit] = prev[bit];
				gf128mul_x8_bbe(&cur[bit]);
			}
		}

		/* Every other entry is the XOR of its top bit and the rest. */
		for (bit = 2; bit < 256; bit <<= 1)
			for (lo = 1; lo < bit; lo++)
				be128_xor(&cur[bit + lo], &cur[bit], &cur[lo]);
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

/*
 * gf128mul_free_64k - release a table set built by gf128mul_init_64k_bbe()
 * @t: table set to free; NULL is accepted and ignored
 *
 * Each of the 16 sub-tables and the container itself are released with
 * kfree_sensitive().  Sub-table slots that are still NULL (e.g. after a
 * partially failed initialization) are passed through unchanged.
 */
void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	/* Match kfree()-style helpers: freeing nothing is not an error. */
	if (!t)
		return;

	for (i = 0; i < 16; i++)
		kfree_sensitive(t->t[i]);
	kfree_sensitive(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

/*
 * gf128mul_64k_bbe - multiply *a in place using the 64k table set
 * @a: in: the element to multiply; out: the result
 * @t: table set built by gf128mul_init_64k_bbe()
 *
 * Byte 15 of *a indexes table 0, byte 14 indexes table 1, and so on down
 * to byte 0 indexing table 15; the sixteen looked-up entries are XOR-ed
 * together.
 */
void gf128mul_64k_bbe(be128 *a, const struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[0]->t[bytes[15]];
	int pos;

	for (pos = 14; pos >= 0; pos--)
		be128_xor(&acc, &acc, &t->t[15 - pos]->t[bytes[pos]]);

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 bytes of the next
    lower byte in the buffer, stopping when we reach the
    lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

/*
 * gf128mul_4k_lle - multiply *a in place using a 4k table ("lle")
 * @a: in: the element to multiply; out: the result
 * @t: table built by gf128mul_init_4k_lle()
 *
 * Starts from the table entry for byte 15 of *a, then for bytes 14 down
 * to 0 multiplies the accumulator by x^8 and XORs in that byte's entry.
 */
void gf128mul_4k_lle(be128 *a, const struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[15]];
	int pos;

	for (pos = 14; pos >= 0; pos--) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

/*
 * gf128mul_4k_bbe - multiply *a in place using a 4k table ("bbe")
 * @a: in: the element to multiply; out: the result
 * @t: table built by gf128mul_init_4k_bbe()
 *
 * Starts from the table entry for byte 0 of *a, then for bytes 1 up to
 * 15 multiplies the accumulator by x^8 and XORs in that byte's entry.
 */
void gf128mul_4k_bbe(be128 *a, const struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc = t->t[bytes[0]];
	int pos;

	for (pos = 1; pos < 16; pos++) {
		gf128mul_x8_bbe(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

/* Module metadata: license tag and one-line description. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
63be5b53	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
f33fd647 EB	91	/*
	92	* Given a value i in 0..255 as the byte overflow when a field element
	93	* in GF(2^128) is multiplied by x^8, the following macro returns the
	94	* 16-bit value that must be XOR-ed into the low-degree end of the
	95	* product to reduce it modulo the polynomial x^128 + x^7 + x^2 + x + 1.
	96	*
	97	* There are two versions of the macro, and hence two tables: one for
	98	* the "be" convention where the highest-order bit is the coefficient of
	99	* the highest-degree polynomial term, and one for the "le" convention
	100	* where the highest-order bit is the coefficient of the lowest-degree
	101	* polynomial term. In both cases the values are stored in CPU byte
	102	* endianness such that the coefficients are ordered consistently across
	103	* bytes, i.e. in the "be" table bits 15..0 of the stored value
	104	* correspond to the coefficients of x^15..x^0, and in the "le" table
	105	* bits 15..0 correspond to the coefficients of x^0..x^15.
	106	*
	107	* Therefore, provided that the appropriate byte endianness conversions
	108	* are done by the multiplication functions (and these must be in place
	109	* anyway to support both little endian and big endian CPUs), the "be"
	110	* table can be used for multiplications of both "bbe" and "ble"
	111	* elements, and the "le" table can be used for multiplications of both
	112	* "lle" and "lbe" elements.
	113	*/
c494e070	114
f33fd647	115	#define xda_be(i) ( \
2416e4fa EB	116	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	117	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	118	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	119	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	120	)
c494e070 RS	121
f33fd647	122	#define xda_le(i) ( \
2416e4fa EB	123	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	124	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	125	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	126	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	127	)
c494e070 RS	128
f33fd647 EB	129	static const u16 gf128mul_table_le[256] = gf128mul_dat(xda_le);
f33fd647 EB	130	static const u16 gf128mul_table_be[256] = gf128mul_dat(xda_be);
c494e070	131
63be5b53	132	/*
acb9b159	133	* The following functions multiply a field element by x^8 in
63be5b53 EB	134	* the polynomial field representation. They use 64-bit word operations
63be5b53 EB	135	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	136	* correctly on both styles of machine.
	137	*/
	138
c494e070 RS	139	static void gf128mul_x8_lle(be128 *x)
	140	{
	141	u64 a = be64_to_cpu(x->a);
	142	u64 b = be64_to_cpu(x->b);
f33fd647	143	u64 _tt = gf128mul_table_le[b & 0xff];
c494e070 RS	144
	145	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	146	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	147	}
	148
b67ce439 AB	149	/* time invariant version of gf128mul_x8_lle */
	150	static void gf128mul_x8_lle_ti(be128 *x)
	151	{
	152	u64 a = be64_to_cpu(x->a);
	153	u64 b = be64_to_cpu(x->b);
	154	u64 _tt = xda_le(b & 0xff); /* avoid table lookup */
	155
	156	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	157	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	158	}
	159
c494e070 RS	160	static void gf128mul_x8_bbe(be128 *x)
	161	{
	162	u64 a = be64_to_cpu(x->a);
	163	u64 b = be64_to_cpu(x->b);
f33fd647	164	u64 _tt = gf128mul_table_be[a >> 56];
c494e070 RS	165
	166	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	167	x->b = cpu_to_be64((b << 8) ^ _tt);
	168	}
	169
acfc5878 HJ	170	void gf128mul_x8_ble(le128 r, const le128 x)
	171	{
	172	u64 a = le64_to_cpu(x->a);
	173	u64 b = le64_to_cpu(x->b);
acfc5878 HJ	174	u64 _tt = gf128mul_table_be[a >> 56];
	175
	176	r->a = cpu_to_le64((a << 8) \| (b >> 56));
	177	r->b = cpu_to_le64((b << 8) ^ _tt);
	178	}
	179	EXPORT_SYMBOL(gf128mul_x8_ble);
	180
c494e070 RS	181	void gf128mul_lle(be128 r, const be128 b)
c494e070 RS	182	{
b67ce439 AB	183	/*
	184	* The p array should be aligned to twice the size of its element type,
	185	* so that every even/odd pair is guaranteed to share a cacheline
	186	* (assuming a cacheline size of 32 bytes or more, which is by far the
	187	* most common). This ensures that each be128_xor() call in the loop
	188	* takes the same amount of time regardless of the value of 'ch', which
	189	* is derived from function parameter 'b', which is commonly used as a
	190	* key, e.g., for GHASH. The odd array elements are all set to zero,
	191	* making each be128_xor() a NOP if its associated bit in 'ch' is not
	192	* set, and this is equivalent to calling be128_xor() conditionally.
	193	* This approach aims to avoid leaking information about such keys
	194	* through execution time variances.
	195	*
	196	* Unfortunately, __aligned(16) or higher does not work on x86 for
	197	* variables on the stack so we need to perform the alignment by hand.
	198	*/
	199	be128 array[16 + 3] = {};
	200	be128 p = PTR_ALIGN(&array[0], 2 sizeof(be128));
c494e070 RS	201	int i;
	202
	203	p[0] = *r;
	204	for (i = 0; i < 7; ++i)
b67ce439	205	gf128mul_x_lle(&p[2 * i + 2], &p[2 * i]);
c494e070	206
62542663	207	memset(r, 0, sizeof(*r));
c494e070 RS	208	for (i = 0;;) {
	209	u8 ch = ((u8 *)b)[15 - i];
	210
b67ce439 AB	211	be128_xor(r, r, &p[ 0 + !(ch & 0x80)]);
	212	be128_xor(r, r, &p[ 2 + !(ch & 0x40)]);
	213	be128_xor(r, r, &p[ 4 + !(ch & 0x20)]);
	214	be128_xor(r, r, &p[ 6 + !(ch & 0x10)]);
	215	be128_xor(r, r, &p[ 8 + !(ch & 0x08)]);
	216	be128_xor(r, r, &p[10 + !(ch & 0x04)]);
	217	be128_xor(r, r, &p[12 + !(ch & 0x02)]);
	218	be128_xor(r, r, &p[14 + !(ch & 0x01)]);
c494e070 RS	219
	220	if (++i >= 16)
	221	break;
	222
b67ce439	223	gf128mul_x8_lle_ti(r); /* use the time invariant version */
c494e070 RS	224	}
	225	}
	226	EXPORT_SYMBOL(gf128mul_lle);
	227
	228	void gf128mul_bbe(be128 r, const be128 b)
	229	{
	230	be128 p[8];
	231	int i;
	232
	233	p[0] = *r;
	234	for (i = 0; i < 7; ++i)
	235	gf128mul_x_bbe(&p[i + 1], &p[i]);
	236
62542663	237	memset(r, 0, sizeof(*r));
c494e070 RS	238	for (i = 0;;) {
	239	u8 ch = ((u8 *)b)[i];
	240
	241	if (ch & 0x80)
	242	be128_xor(r, r, &p[7]);
	243	if (ch & 0x40)
	244	be128_xor(r, r, &p[6]);
	245	if (ch & 0x20)
	246	be128_xor(r, r, &p[5]);
	247	if (ch & 0x10)
	248	be128_xor(r, r, &p[4]);
	249	if (ch & 0x08)
	250	be128_xor(r, r, &p[3]);
	251	if (ch & 0x04)
	252	be128_xor(r, r, &p[2]);
	253	if (ch & 0x02)
	254	be128_xor(r, r, &p[1]);
	255	if (ch & 0x01)
	256	be128_xor(r, r, &p[0]);
	257
	258	if (++i >= 16)
	259	break;
	260
	261	gf128mul_x8_bbe(r);
	262	}
	263	}
	264	EXPORT_SYMBOL(gf128mul_bbe);
	265
	266	/* This version uses 64k bytes of table space.
	267	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	268	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	269	the buffer's lowest byte, we can construct a table of
	270	the 256 16 byte values that result from the 256 values
	271	of this byte. This requires 4096 bytes. But we also
	272	need tables for each of the 16 higher bytes in the
	273	buffer as well, which makes 64 kbytes in total.
	274	*/
	275	/* additional explanation
	276	* t[0][BYTE] contains g*BYTE
	277	* t[1][BYTE] contains gx^8BYTE
	278	* ..
	279	* t[15][BYTE] contains gx^120BYTE */
c494e070 RS	280	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	281	{
	282	struct gf128mul_64k *t;
	283	int i, j, k;
	284
	285	t = kzalloc(sizeof(*t), GFP_KERNEL);
	286	if (!t)
	287	goto out;
	288
	289	for (i = 0; i < 16; i++) {
	290	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	291	if (!t->t[i]) {
	292	gf128mul_free_64k(t);
	293	t = NULL;
	294	goto out;
	295	}
	296	}
	297
	298	t->t[0]->t[1] = *g;
	299	for (j = 1; j <= 64; j <<= 1)
	300	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
	301
	302	for (i = 0;;) {
	303	for (j = 2; j < 256; j += j)
	304	for (k = 1; k < j; ++k)
	305	be128_xor(&t->t[i]->t[j + k],
	306	&t->t[i]->t[j], &t->t[i]->t[k]);
	307
	308	if (++i >= 16)
	309	break;
	310
	311	for (j = 128; j > 0; j >>= 1) {
	312	t->t[i]->t[j] = t->t[i - 1]->t[j];
	313	gf128mul_x8_bbe(&t->t[i]->t[j]);
	314	}
	315	}
	316
	317	out:
	318	return t;
	319	}
	320	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
	321
	322	void gf128mul_free_64k(struct gf128mul_64k *t)
	323	{
	324	int i;
	325
	326	for (i = 0; i < 16; i++)
453431a5 WL	327	kfree_sensitive(t->t[i]);
453431a5 WL	328	kfree_sensitive(t);
c494e070 RS	329	}
	330	EXPORT_SYMBOL(gf128mul_free_64k);
	331
3ea996dd	332	void gf128mul_64k_bbe(be128 a, const struct gf128mul_64k t)
c494e070 RS	333	{
	334	u8 ap = (u8 )a;
	335	be128 r[1];
	336	int i;
	337
	338	*r = t->t[0]->t[ap[15]];
	339	for (i = 1; i < 16; ++i)
	340	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	341	a = r;
	342	}
	343	EXPORT_SYMBOL(gf128mul_64k_bbe);
	344
	345	/* This version uses 4k bytes of table space.
	346	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	347	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	348	single byte, we can construct a table of the 256 16 byte
	349	values that result from the 256 values of this byte.
	350	This requires 4096 bytes. If we take the highest byte in
	351	the buffer and use this table to get the result, we then
	352	have to multiply by x^120 to get the final value. For the
	353	next highest byte the result has to be multiplied by x^112
	354	and so on. But we can do this by accumulating the result
	355	in an accumulator starting with the result for the top
	356	byte. We repeatedly multiply the accumulator value by
	357	x^8 and then add in (i.e. xor) the 16 bytes of the next
	358	lower byte in the buffer, stopping when we reach the
	359	lowest byte. This requires a 4096 byte table.
	360	*/
	361	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	362	{
	363	struct gf128mul_4k *t;
	364	int j, k;
	365
	366	t = kzalloc(sizeof(*t), GFP_KERNEL);
	367	if (!t)
	368	goto out;
	369
	370	t->t[128] = *g;
	371	for (j = 64; j > 0; j >>= 1)
	372	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	373
	374	for (j = 2; j < 256; j += j)
	375	for (k = 1; k < j; ++k)
	376	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	377
	378	out:
	379	return t;
	380	}
	381	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	382
	383	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	384	{
	385	struct gf128mul_4k *t;
	386	int j, k;
	387
	388	t = kzalloc(sizeof(*t), GFP_KERNEL);
	389	if (!t)
	390	goto out;
	391
	392	t->t[1] = *g;
	393	for (j = 1; j <= 64; j <<= 1)
	394	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	395
	396	for (j = 2; j < 256; j += j)
	397	for (k = 1; k < j; ++k)
	398	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	399
	400	out:
	401	return t;
	402	}
	403	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	404
3ea996dd	405	void gf128mul_4k_lle(be128 a, const struct gf128mul_4k t)
c494e070 RS	406	{
	407	u8 ap = (u8 )a;
	408	be128 r[1];
	409	int i = 15;
	410
	411	*r = t->t[ap[15]];
	412	while (i--) {
	413	gf128mul_x8_lle(r);
	414	be128_xor(r, r, &t->t[ap[i]]);
	415	}
	416	a = r;
	417	}
	418	EXPORT_SYMBOL(gf128mul_4k_lle);
	419
3ea996dd	420	void gf128mul_4k_bbe(be128 a, const struct gf128mul_4k t)
c494e070 RS	421	{
	422	u8 ap = (u8 )a;
	423	be128 r[1];
	424	int i = 0;
	425
	426	*r = t->t[ap[0]];
	427	while (++i < 16) {
	428	gf128mul_x8_bbe(r);
	429	be128_xor(r, r, &t->t[ap[i]]);
	430	}
	431	a = r;
	432	}
	433	EXPORT_SYMBOL(gf128mul_4k_bbe);
	434
	435	MODULE_LICENSE("GPL");
	436	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");