[linux-2.6-block.git] / crypto / gf128mul.c

/* gf128mul.c - GF(2^128) multiplication functions
 *
 * Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
 * Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
 *
 * Based on Dr Brian Gladman's (GPL'd) work published at
 * http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
 * See the original copyright notice below.
 *
 * This program is free software; you can redistribute it and/or modify it
 * under the terms of the GNU General Public License as published by the Free
 * Software Foundation; either version 2 of the License, or (at your option)
 * any later version.
 */

/*
 ---------------------------------------------------------------------------
 Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.   All rights reserved.

 LICENSE TERMS

 The free distribution and use of this software in both source and binary
 form is allowed (with or without changes) provided that:

   1. distributions of this source code include the above copyright
      notice, this list of conditions and the following disclaimer;

   2. distributions in binary form include the above copyright
      notice, this list of conditions and the following disclaimer
      in the documentation and/or other associated materials;

   3. the copyright holder's name is not used to endorse products
      built using this software without specific written permission.

 ALTERNATIVELY, provided that this notice is retained in full, this product
 may be distributed under the terms of the GNU General Public License (GPL),
 in which case the provisions of the GPL apply INSTEAD OF those given above.

 DISCLAIMER

 This software is provided 'as is' with no explicit or implied warranties
 in respect of its properties, including, but not limited to, correctness
 and/or fitness for purpose.
 ---------------------------------------------------------------------------
 Issue 31/01/2006

 This file provides fast multiplication in GF(2^128) as required by several
 cryptographic authentication modes
*/

#include <crypto/gf128mul.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/slab.h>

/*
 * gf128mul_dat(q) - expand to a brace-enclosed initializer with 256 entries.
 *
 * The macro (or function-like name) passed as q is applied to every byte
 * value 0x00..0xff in ascending order, so entry n of the resulting array
 * is q(n).  Each argument handed to q is a plain hexadecimal literal.
 */
#define gf128mul_dat(q) { \
	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
}

/*	Given the value i in 0..255 as the byte overflow when a field element
    in GHASH is multiplied by x^8, the following macros evaluate to the
    16-bit value that is generated by applying the modular polynomial to
    that overflow. gf128mul_dat() expands them over every byte value to
    build the lookup tables below; the multiply-by-x and multiply-by-x^8
    helpers XOR the selected entry into the low 16 bits of the field value
    (bbe tables) or, shifted left by 48, into its high 64-bit word
    (lle tables).
*/

/*
 * xda_bbe(i) - 16-bit reduction value for overflow byte i ("bbe" tables).
 *
 * Every set bit k of i contributes the constant 0x0087 << k; the
 * contributions are combined with XOR.  The argument is parenthesized at
 * each use so that an expression such as (a | b) is evaluated as a whole
 * before the bit test instead of being split by operator precedence.
 */
#define xda_bbe(i) ( \
	((i) & 0x80 ? 0x4380 : 0) ^ ((i) & 0x40 ? 0x21c0 : 0) ^ \
	((i) & 0x20 ? 0x10e0 : 0) ^ ((i) & 0x10 ? 0x0870 : 0) ^ \
	((i) & 0x08 ? 0x0438 : 0) ^ ((i) & 0x04 ? 0x021c : 0) ^ \
	((i) & 0x02 ? 0x010e : 0) ^ ((i) & 0x01 ? 0x0087 : 0) \
)

/*
 * xda_lle(i) - 16-bit reduction value for overflow byte i ("lle" tables).
 *
 * Bit 0x80 of i contributes 0xe100 and each lower bit contributes that
 * constant shifted right by one more position (down to 0x01c2 for bit
 * 0x01); the contributions are combined with XOR.  The argument is
 * parenthesized at each use so that an expression such as (a | b) is
 * evaluated as a whole before the bit test.
 */
#define xda_lle(i) ( \
	((i) & 0x80 ? 0xe100 : 0) ^ ((i) & 0x40 ? 0x7080 : 0) ^ \
	((i) & 0x20 ? 0x3840 : 0) ^ ((i) & 0x10 ? 0x1c20 : 0) ^ \
	((i) & 0x08 ? 0x0e10 : 0) ^ ((i) & 0x04 ? 0x0708 : 0) ^ \
	((i) & 0x02 ? 0x0384 : 0) ^ ((i) & 0x01 ? 0x01c2 : 0) \
)

/*
 * Reduction lookup tables, one u16 per possible overflow byte: entry n is
 * xda_lle(n) or xda_bbe(n) respectively, generated at compile time by
 * gf128mul_dat().
 */
static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);

/*
 * The following functions multiply a field element by x or by x^8 in
 * the polynomial field representation.  They use 64-bit word operations
 * to gain speed but compensate for machine endianness and hence work
 * correctly on both styles of machine.
 */

/*
 * Multiply the field element *x by x in the "lle" layout and store the
 * product in *r.  Both 64-bit words are loaded before anything is written.
 *
 * The 128-bit value is shifted right by one bit across the two words; the
 * bit that drops out of the bottom of x->b selects the reduction constant
 * that is XORed into the top 16 bits of the result's first word.
 */
static void gf128mul_x_lle(be128 *r, const be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	/* index is 0x80 when the lowest bit of lo is set, otherwise 0 */
	const u64 reduce = gf128mul_table_lle[(lo << 7) & 0xff];

	r->b = cpu_to_be64((hi << 63) | (lo >> 1));
	r->a = cpu_to_be64((reduce << 48) ^ (hi >> 1));
}

/*
 * Multiply the field element *x by x in the "bbe" layout and store the
 * product in *r.  Both 64-bit words are loaded before anything is written.
 *
 * The 128-bit value is shifted left by one bit across the two words; the
 * bit that drops out of the top of x->a selects the reduction constant
 * (table entry 0 or 1) that is XORed into the second word of the result.
 */
static void gf128mul_x_bbe(be128 *r, const be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_bbe[hi >> 63];

	r->a = cpu_to_be64((lo >> 63) | (hi << 1));
	r->b = cpu_to_be64(reduce ^ (lo << 1));
}

/*
 * gf128mul_x_ble - multiply *x by x, with both words read and written as
 * little-endian 64-bit values, storing the product in *r.
 *
 * Here x->a is the less significant word: it is shifted left by one and
 * receives the reduction constant, while its top bit is carried into the
 * bottom of x->b.  The bit shifted out of the top of x->b selects the
 * reduction constant (entry 0 or 1 of the bbe table).
 *
 * NOTE(review): the parameters are typed be128 although the words are
 * converted with le64_to_cpu()/cpu_to_le64() -- confirm this is intended.
 */
void gf128mul_x_ble(be128 *r, const be128 *x)
{
	const u64 lo = le64_to_cpu(x->a);
	const u64 hi = le64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_bbe[hi >> 63];

	r->a = cpu_to_le64(reduce ^ (lo << 1));
	r->b = cpu_to_le64((lo >> 63) | (hi << 1));
}
EXPORT_SYMBOL(gf128mul_x_ble);

/*
 * Multiply the field element *x by x^8 in place ("lle" layout).
 *
 * The 128-bit value is shifted right by one byte across the two words; the
 * byte that drops out of the bottom of x->b indexes the lle reduction table
 * and the entry is XORed into the top 16 bits of the first word.
 */
static void gf128mul_x8_lle(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_lle[lo & 0xff];

	x->b = cpu_to_be64((hi << 56) | (lo >> 8));
	x->a = cpu_to_be64((reduce << 48) ^ (hi >> 8));
}

/*
 * Multiply the field element *x by x^8 in place ("bbe" layout).
 *
 * The 128-bit value is shifted left by one byte across the two words; the
 * byte that drops out of the top of x->a indexes the bbe reduction table
 * and the entry is XORed into the second word.
 */
static void gf128mul_x8_bbe(be128 *x)
{
	const u64 hi = be64_to_cpu(x->a);
	const u64 lo = be64_to_cpu(x->b);
	const u64 reduce = gf128mul_table_bbe[hi >> 56];

	x->a = cpu_to_be64((lo >> 56) | (hi << 8));
	x->b = cpu_to_be64(reduce ^ (lo << 8));
}

/*
 * gf128mul_lle - multiply *r by *b ("lle" layout), leaving the product in *r.
 * @r: first operand on entry, product on return
 * @b: second operand, read one byte at a time from offset 15 down to 0
 *
 * Eight copies of the original *r, each one multiplied by x once more than
 * the previous, are prepared up front.  The accumulator *r is then cleared
 * and, for every byte of *b, the copies selected by the set bits of that
 * byte are XORed in; between bytes the accumulator is multiplied by x^8.
 */
void gf128mul_lle(be128 *r, const be128 *b)
{
	const u8 *bytes = (const u8 *)b;
	be128 shifted[8];
	int byte, bit;

	/* shifted[k] is the original *r multiplied by x, k times */
	shifted[0] = *r;
	for (bit = 1; bit < 8; bit++)
		gf128mul_x_lle(&shifted[bit], &shifted[bit - 1]);

	memset(r, 0, sizeof(*r));
	for (byte = 0; byte < 16; byte++) {
		u8 ch;

		/* scale the accumulator between bytes, never before the first */
		if (byte)
			gf128mul_x8_lle(r);

		ch = bytes[15 - byte];
		/* bit 0x80 selects shifted[0], bit 0x01 selects shifted[7] */
		for (bit = 0; bit < 8; bit++)
			if (ch & (0x80 >> bit))
				be128_xor(r, r, &shifted[bit]);
	}
}
EXPORT_SYMBOL(gf128mul_lle);

/*
 * gf128mul_bbe - multiply *r by *b ("bbe" layout), leaving the product in *r.
 * @r: first operand on entry, product on return
 * @b: second operand, read one byte at a time from offset 0 up to 15
 *
 * Eight copies of the original *r, each one multiplied by x once more than
 * the previous, are prepared up front.  The accumulator *r is then cleared
 * and, for every byte of *b, the copies selected by the set bits of that
 * byte are XORed in; between bytes the accumulator is multiplied by x^8.
 */
void gf128mul_bbe(be128 *r, const be128 *b)
{
	const u8 *bytes = (const u8 *)b;
	be128 shifted[8];
	int byte, bit;

	/* shifted[k] is the original *r multiplied by x, k times */
	shifted[0] = *r;
	for (bit = 1; bit < 8; bit++)
		gf128mul_x_bbe(&shifted[bit], &shifted[bit - 1]);

	memset(r, 0, sizeof(*r));
	for (byte = 0; byte < 16; byte++) {
		u8 ch;

		/* scale the accumulator between bytes, never before the first */
		if (byte)
			gf128mul_x8_bbe(r);

		ch = bytes[byte];
		/* bit k of the byte selects shifted[k]; highest bit is handled first */
		for (bit = 7; bit >= 0; bit--)
			if (ch & (1 << bit))
				be128_xor(r, r, &shifted[bit]);
	}
}
EXPORT_SYMBOL(gf128mul_bbe);

/*      This version uses 64k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in
    the buffer's lowest byte, we can construct a table of
    the 256 16 byte values that result from the 256 values
    of this byte.  This requires 4096 bytes. But we also
    need tables for each of the 15 higher bytes in the
    buffer as well, which makes 16 tables and 64 kbytes in total.
*/
/* additional explanation
 * t[0][BYTE] contains g*BYTE
 * t[1][BYTE] contains g*x^8*BYTE
 *  ..
 * t[15][BYTE] contains g*x^120*BYTE */
/*
 * gf128mul_init_64k_bbe - build the 16 per-byte multiplication tables for g.
 * @g: the fixed multiplier
 *
 * Returns a newly allocated table set (GFP_KERNEL), or NULL if any of the
 * allocations fails; on failure everything allocated so far is released
 * through gf128mul_free_64k().
 */
struct gf128mul_64k *gf128mul_init_64k_bbe(const be128 *g)
{
	struct gf128mul_64k *tbl;
	int byte, bit, low;

	tbl = kzalloc(sizeof(*tbl), GFP_KERNEL);
	if (!tbl)
		return NULL;

	for (byte = 0; byte < 16; byte++) {
		tbl->t[byte] = kzalloc(sizeof(*tbl->t[byte]), GFP_KERNEL);
		if (!tbl->t[byte]) {
			/* slots not reached yet are still NULL from kzalloc() */
			gf128mul_free_64k(tbl);
			return NULL;
		}
	}

	/* table 0: power-of-two slots hold g multiplied by x, 0..7 times */
	tbl->t[0]->t[1] = *g;
	for (bit = 1; bit <= 64; bit <<= 1)
		gf128mul_x_bbe(&tbl->t[0]->t[2 * bit], &tbl->t[0]->t[bit]);

	for (byte = 0; byte < 16; byte++) {
		/*
		 * Tables 1..15: derive the power-of-two slots from the
		 * previous table by one multiplication by x^8 each.
		 */
		if (byte > 0) {
			for (bit = 128; bit > 0; bit >>= 1) {
				tbl->t[byte]->t[bit] = tbl->t[byte - 1]->t[bit];
				gf128mul_x8_bbe(&tbl->t[byte]->t[bit]);
			}
		}

		/* every remaining slot is the XOR of slots already filled */
		for (bit = 2; bit < 256; bit <<= 1)
			for (low = 1; low < bit; low++)
				be128_xor(&tbl->t[byte]->t[bit + low],
					  &tbl->t[byte]->t[bit],
					  &tbl->t[byte]->t[low]);
	}

	return tbl;
}
EXPORT_SYMBOL(gf128mul_init_64k_bbe);

/*
 * gf128mul_free_64k - release a table set built by gf128mul_init_64k_bbe().
 * @t: table set to release; may be NULL, and individual sub-table pointers
 *     may be NULL (as happens on the allocation-failure path of the init
 *     function)
 *
 * All 16 sub-tables and then the container itself are handed to kzfree().
 */
void gf128mul_free_64k(struct gf128mul_64k *t)
{
	int i;

	/* Accept NULL like the kfree() family does instead of dereferencing it. */
	if (!t)
		return;

	for (i = 0; i < 16; i++)
		kzfree(t->t[i]);
	kzfree(t);
}
EXPORT_SYMBOL(gf128mul_free_64k);

/*
 * gf128mul_64k_bbe - multiply *a in place using a precomputed 64k table set.
 * @a: operand on entry, product on return
 * @t: table set from gf128mul_init_64k_bbe()
 *
 * Byte 15 of *a is looked up in table 0, byte 14 in table 1, and so on up
 * to byte 0 in table 15; the sixteen entries are XORed together.  The sum
 * is built in a local so *a is only overwritten after all bytes are read.
 */
void gf128mul_64k_bbe(be128 *a, struct gf128mul_64k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc;
	int tbl;

	acc = t->t[0]->t[bytes[15]];
	for (tbl = 1; tbl < 16; tbl++)
		be128_xor(&acc, &acc, &t->t[tbl]->t[bytes[15 - tbl]]);

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_64k_bbe);

/*      This version uses 4k bytes of table space.
    A 16 byte buffer has to be multiplied by a 16 byte key
    value in GF(2^128).  If we consider a GF(2^128) value in a
    single byte, we can construct a table of the 256 16 byte
    values that result from the 256 values of this byte.
    This requires 4096 bytes. If we take the highest byte in
    the buffer and use this table to get the result, we then
    have to multiply by x^120 to get the final value. For the
    next highest byte the result has to be multiplied by x^112
    and so on. But we can do this by accumulating the result
    in an accumulator starting with the result for the top
    byte.  We repeatedly multiply the accumulator value by
    x^8 and then add in (i.e. xor) the 16 bytes of the next
    lower byte in the buffer, stopping when we reach the
    lowest byte. This requires a 4096 byte table.
*/
struct gf128mul_4k *gf128mul_init_4k_lle(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[128] = *g;
	for (j = 64; j > 0; j >>= 1)
		gf128mul_x_lle(&t->t[j], &t->t[j+j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_lle);

struct gf128mul_4k *gf128mul_init_4k_bbe(const be128 *g)
{
	struct gf128mul_4k *t;
	int j, k;

	t = kzalloc(sizeof(*t), GFP_KERNEL);
	if (!t)
		goto out;

	t->t[1] = *g;
	for (j = 1; j <= 64; j <<= 1)
		gf128mul_x_bbe(&t->t[j + j], &t->t[j]);

	for (j = 2; j < 256; j += j)
		for (k = 1; k < j; ++k)
			be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);

out:
	return t;
}
EXPORT_SYMBOL(gf128mul_init_4k_bbe);

/*
 * gf128mul_4k_lle - multiply *a in place using a precomputed 4k table
 * ("lle" layout).
 * @a: operand on entry, product on return
 * @t: table from gf128mul_init_4k_lle()
 *
 * Starts from the table entry for byte 15 of *a, then for bytes 14 down to
 * 0 multiplies the accumulator by x^8 and XORs in that byte's table entry.
 * The result is built in a local so *a is only overwritten at the end.
 */
void gf128mul_4k_lle(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc;
	int pos;

	acc = t->t[bytes[15]];
	for (pos = 14; pos >= 0; pos--) {
		gf128mul_x8_lle(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_lle);

/*
 * gf128mul_4k_bbe - multiply *a in place using a precomputed 4k table
 * ("bbe" layout).
 * @a: operand on entry, product on return
 * @t: table from gf128mul_init_4k_bbe()
 *
 * Starts from the table entry for byte 0 of *a, then for bytes 1 up to 15
 * multiplies the accumulator by x^8 and XORs in that byte's table entry.
 * The result is built in a local so *a is only overwritten at the end.
 */
void gf128mul_4k_bbe(be128 *a, struct gf128mul_4k *t)
{
	const u8 *bytes = (const u8 *)a;
	be128 acc;
	int pos;

	acc = t->t[bytes[0]];
	for (pos = 1; pos < 16; pos++) {
		gf128mul_x8_bbe(&acc);
		be128_xor(&acc, &acc, &t->t[bytes[pos]]);
	}

	*a = acc;
}
EXPORT_SYMBOL(gf128mul_4k_bbe);

/* Module metadata: license tag and a one-line description of this library. */
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");
Commit	Line	Data
c494e070 RS	1	/* gf128mul.c - GF(2^128) multiplication functions
	2	*
	3	* Copyright (c) 2003, Dr Brian Gladman, Worcester, UK.
	4	* Copyright (c) 2006, Rik Snel <rsnel@cube.dyndns.org>
	5	*
	6	* Based on Dr Brian Gladman's (GPL'd) work published at
8c882f64	7	* http://gladman.plushost.co.uk/oldsite/cryptography_technology/index.php
c494e070 RS	8	* See the original copyright notice below.
	9	*
	10	* This program is free software; you can redistribute it and/or modify it
	11	* under the terms of the GNU General Public License as published by the Free
	12	* Software Foundation; either version 2 of the License, or (at your option)
	13	* any later version.
	14	*/
	15
	16	/*
	17	---------------------------------------------------------------------------
	18	Copyright (c) 2003, Dr Brian Gladman, Worcester, UK. All rights reserved.
	19
	20	LICENSE TERMS
	21
	22	The free distribution and use of this software in both source and binary
	23	form is allowed (with or without changes) provided that:
	24
	25	1. distributions of this source code include the above copyright
	26	notice, this list of conditions and the following disclaimer;
	27
	28	2. distributions in binary form include the above copyright
	29	notice, this list of conditions and the following disclaimer
	30	in the documentation and/or other associated materials;
	31
	32	3. the copyright holder's name is not used to endorse products
	33	built using this software without specific written permission.
	34
	35	ALTERNATIVELY, provided that this notice is retained in full, this product
	36	may be distributed under the terms of the GNU General Public License (GPL),
	37	in which case the provisions of the GPL apply INSTEAD OF those given above.
	38
	39	DISCLAIMER
	40
	41	This software is provided 'as is' with no explicit or implied warranties
	42	in respect of its properties, including, but not limited to, correctness
	43	and/or fitness for purpose.
	44	---------------------------------------------------------------------------
	45	Issue 31/01/2006
	46
63be5b53	47	This file provides fast multiplication in GF(2^128) as required by several
c494e070 RS	48	cryptographic authentication modes
	49	*/
	50
	51	#include <crypto/gf128mul.h>
	52	#include <linux/kernel.h>
	53	#include <linux/module.h>
	54	#include <linux/slab.h>
	55
	56	#define gf128mul_dat(q) { \
	57	q(0x00), q(0x01), q(0x02), q(0x03), q(0x04), q(0x05), q(0x06), q(0x07),\
	58	q(0x08), q(0x09), q(0x0a), q(0x0b), q(0x0c), q(0x0d), q(0x0e), q(0x0f),\
	59	q(0x10), q(0x11), q(0x12), q(0x13), q(0x14), q(0x15), q(0x16), q(0x17),\
	60	q(0x18), q(0x19), q(0x1a), q(0x1b), q(0x1c), q(0x1d), q(0x1e), q(0x1f),\
	61	q(0x20), q(0x21), q(0x22), q(0x23), q(0x24), q(0x25), q(0x26), q(0x27),\
	62	q(0x28), q(0x29), q(0x2a), q(0x2b), q(0x2c), q(0x2d), q(0x2e), q(0x2f),\
	63	q(0x30), q(0x31), q(0x32), q(0x33), q(0x34), q(0x35), q(0x36), q(0x37),\
	64	q(0x38), q(0x39), q(0x3a), q(0x3b), q(0x3c), q(0x3d), q(0x3e), q(0x3f),\
	65	q(0x40), q(0x41), q(0x42), q(0x43), q(0x44), q(0x45), q(0x46), q(0x47),\
	66	q(0x48), q(0x49), q(0x4a), q(0x4b), q(0x4c), q(0x4d), q(0x4e), q(0x4f),\
	67	q(0x50), q(0x51), q(0x52), q(0x53), q(0x54), q(0x55), q(0x56), q(0x57),\
	68	q(0x58), q(0x59), q(0x5a), q(0x5b), q(0x5c), q(0x5d), q(0x5e), q(0x5f),\
	69	q(0x60), q(0x61), q(0x62), q(0x63), q(0x64), q(0x65), q(0x66), q(0x67),\
	70	q(0x68), q(0x69), q(0x6a), q(0x6b), q(0x6c), q(0x6d), q(0x6e), q(0x6f),\
	71	q(0x70), q(0x71), q(0x72), q(0x73), q(0x74), q(0x75), q(0x76), q(0x77),\
	72	q(0x78), q(0x79), q(0x7a), q(0x7b), q(0x7c), q(0x7d), q(0x7e), q(0x7f),\
	73	q(0x80), q(0x81), q(0x82), q(0x83), q(0x84), q(0x85), q(0x86), q(0x87),\
	74	q(0x88), q(0x89), q(0x8a), q(0x8b), q(0x8c), q(0x8d), q(0x8e), q(0x8f),\
	75	q(0x90), q(0x91), q(0x92), q(0x93), q(0x94), q(0x95), q(0x96), q(0x97),\
	76	q(0x98), q(0x99), q(0x9a), q(0x9b), q(0x9c), q(0x9d), q(0x9e), q(0x9f),\
	77	q(0xa0), q(0xa1), q(0xa2), q(0xa3), q(0xa4), q(0xa5), q(0xa6), q(0xa7),\
	78	q(0xa8), q(0xa9), q(0xaa), q(0xab), q(0xac), q(0xad), q(0xae), q(0xaf),\
	79	q(0xb0), q(0xb1), q(0xb2), q(0xb3), q(0xb4), q(0xb5), q(0xb6), q(0xb7),\
	80	q(0xb8), q(0xb9), q(0xba), q(0xbb), q(0xbc), q(0xbd), q(0xbe), q(0xbf),\
	81	q(0xc0), q(0xc1), q(0xc2), q(0xc3), q(0xc4), q(0xc5), q(0xc6), q(0xc7),\
	82	q(0xc8), q(0xc9), q(0xca), q(0xcb), q(0xcc), q(0xcd), q(0xce), q(0xcf),\
	83	q(0xd0), q(0xd1), q(0xd2), q(0xd3), q(0xd4), q(0xd5), q(0xd6), q(0xd7),\
	84	q(0xd8), q(0xd9), q(0xda), q(0xdb), q(0xdc), q(0xdd), q(0xde), q(0xdf),\
	85	q(0xe0), q(0xe1), q(0xe2), q(0xe3), q(0xe4), q(0xe5), q(0xe6), q(0xe7),\
	86	q(0xe8), q(0xe9), q(0xea), q(0xeb), q(0xec), q(0xed), q(0xee), q(0xef),\
	87	q(0xf0), q(0xf1), q(0xf2), q(0xf3), q(0xf4), q(0xf5), q(0xf6), q(0xf7),\
	88	q(0xf8), q(0xf9), q(0xfa), q(0xfb), q(0xfc), q(0xfd), q(0xfe), q(0xff) \
	89	}
	90
	91	/* Given the value i in 0..255 as the byte overflow when a field element
25985edc	92	in GHASH is multiplied by x^8, this function will return the values that
c494e070 RS	93	are generated in the lo 16-bit word of the field value by applying the
	94	modular polynomial. The values lo_byte and hi_byte are returned via the
	95	macro xp_fun(lo_byte, hi_byte) so that the values can be assembled into
	96	memory as required by a suitable definition of this macro operating on
	97	the table above
	98	*/
	99
c494e070	100	#define xda_bbe(i) ( \
2416e4fa EB	101	(i & 0x80 ? 0x4380 : 0) ^ (i & 0x40 ? 0x21c0 : 0) ^ \
	102	(i & 0x20 ? 0x10e0 : 0) ^ (i & 0x10 ? 0x0870 : 0) ^ \
	103	(i & 0x08 ? 0x0438 : 0) ^ (i & 0x04 ? 0x021c : 0) ^ \
	104	(i & 0x02 ? 0x010e : 0) ^ (i & 0x01 ? 0x0087 : 0) \
c494e070 RS	105	)
	106
	107	#define xda_lle(i) ( \
2416e4fa EB	108	(i & 0x80 ? 0xe100 : 0) ^ (i & 0x40 ? 0x7080 : 0) ^ \
	109	(i & 0x20 ? 0x3840 : 0) ^ (i & 0x10 ? 0x1c20 : 0) ^ \
	110	(i & 0x08 ? 0x0e10 : 0) ^ (i & 0x04 ? 0x0708 : 0) ^ \
	111	(i & 0x02 ? 0x0384 : 0) ^ (i & 0x01 ? 0x01c2 : 0) \
c494e070 RS	112	)
	113
	114	static const u16 gf128mul_table_lle[256] = gf128mul_dat(xda_lle);
	115	static const u16 gf128mul_table_bbe[256] = gf128mul_dat(xda_bbe);
	116
63be5b53 EB	117	/*
	118	* The following functions multiply a field element by x or by x^8 in
	119	* the polynomial field representation. They use 64-bit word operations
	120	* to gain speed but compensate for machine endianness and hence work
c494e070 RS	121	* correctly on both styles of machine.
	122	*/
	123
	124	static void gf128mul_x_lle(be128 r, const be128 x)
	125	{
	126	u64 a = be64_to_cpu(x->a);
	127	u64 b = be64_to_cpu(x->b);
	128	u64 _tt = gf128mul_table_lle[(b << 7) & 0xff];
	129
	130	r->b = cpu_to_be64((b >> 1) \| (a << 63));
	131	r->a = cpu_to_be64((a >> 1) ^ (_tt << 48));
	132	}
	133
	134	static void gf128mul_x_bbe(be128 r, const be128 x)
	135	{
	136	u64 a = be64_to_cpu(x->a);
	137	u64 b = be64_to_cpu(x->b);
	138	u64 _tt = gf128mul_table_bbe[a >> 63];
	139
	140	r->a = cpu_to_be64((a << 1) \| (b >> 63));
	141	r->b = cpu_to_be64((b << 1) ^ _tt);
	142	}
	143
f19f5111 RS	144	void gf128mul_x_ble(be128 r, const be128 x)
	145	{
	146	u64 a = le64_to_cpu(x->a);
	147	u64 b = le64_to_cpu(x->b);
	148	u64 _tt = gf128mul_table_bbe[b >> 63];
	149
	150	r->a = cpu_to_le64((a << 1) ^ _tt);
	151	r->b = cpu_to_le64((b << 1) \| (a >> 63));
	152	}
	153	EXPORT_SYMBOL(gf128mul_x_ble);
	154
c494e070 RS	155	static void gf128mul_x8_lle(be128 *x)
	156	{
	157	u64 a = be64_to_cpu(x->a);
	158	u64 b = be64_to_cpu(x->b);
	159	u64 _tt = gf128mul_table_lle[b & 0xff];
	160
	161	x->b = cpu_to_be64((b >> 8) \| (a << 56));
	162	x->a = cpu_to_be64((a >> 8) ^ (_tt << 48));
	163	}
	164
	165	static void gf128mul_x8_bbe(be128 *x)
	166	{
	167	u64 a = be64_to_cpu(x->a);
	168	u64 b = be64_to_cpu(x->b);
	169	u64 _tt = gf128mul_table_bbe[a >> 56];
	170
	171	x->a = cpu_to_be64((a << 8) \| (b >> 56));
	172	x->b = cpu_to_be64((b << 8) ^ _tt);
	173	}
	174
	175	void gf128mul_lle(be128 r, const be128 b)
	176	{
	177	be128 p[8];
	178	int i;
	179
	180	p[0] = *r;
	181	for (i = 0; i < 7; ++i)
	182	gf128mul_x_lle(&p[i + 1], &p[i]);
	183
62542663	184	memset(r, 0, sizeof(*r));
c494e070 RS	185	for (i = 0;;) {
	186	u8 ch = ((u8 *)b)[15 - i];
	187
	188	if (ch & 0x80)
	189	be128_xor(r, r, &p[0]);
	190	if (ch & 0x40)
	191	be128_xor(r, r, &p[1]);
	192	if (ch & 0x20)
	193	be128_xor(r, r, &p[2]);
	194	if (ch & 0x10)
	195	be128_xor(r, r, &p[3]);
	196	if (ch & 0x08)
	197	be128_xor(r, r, &p[4]);
	198	if (ch & 0x04)
	199	be128_xor(r, r, &p[5]);
	200	if (ch & 0x02)
	201	be128_xor(r, r, &p[6]);
	202	if (ch & 0x01)
	203	be128_xor(r, r, &p[7]);
	204
	205	if (++i >= 16)
	206	break;
	207
	208	gf128mul_x8_lle(r);
	209	}
	210	}
	211	EXPORT_SYMBOL(gf128mul_lle);
	212
	213	void gf128mul_bbe(be128 r, const be128 b)
	214	{
	215	be128 p[8];
	216	int i;
	217
	218	p[0] = *r;
	219	for (i = 0; i < 7; ++i)
	220	gf128mul_x_bbe(&p[i + 1], &p[i]);
	221
62542663	222	memset(r, 0, sizeof(*r));
c494e070 RS	223	for (i = 0;;) {
	224	u8 ch = ((u8 *)b)[i];
	225
	226	if (ch & 0x80)
	227	be128_xor(r, r, &p[7]);
	228	if (ch & 0x40)
	229	be128_xor(r, r, &p[6]);
	230	if (ch & 0x20)
	231	be128_xor(r, r, &p[5]);
	232	if (ch & 0x10)
	233	be128_xor(r, r, &p[4]);
	234	if (ch & 0x08)
	235	be128_xor(r, r, &p[3]);
	236	if (ch & 0x04)
	237	be128_xor(r, r, &p[2]);
	238	if (ch & 0x02)
	239	be128_xor(r, r, &p[1]);
	240	if (ch & 0x01)
	241	be128_xor(r, r, &p[0]);
	242
	243	if (++i >= 16)
	244	break;
	245
	246	gf128mul_x8_bbe(r);
	247	}
	248	}
	249	EXPORT_SYMBOL(gf128mul_bbe);
	250
	251	/* This version uses 64k bytes of table space.
	252	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	253	value in GF(2^128). If we consider a GF(2^128) value in
c494e070 RS	254	the buffer's lowest byte, we can construct a table of
	255	the 256 16 byte values that result from the 256 values
	256	of this byte. This requires 4096 bytes. But we also
	257	need tables for each of the 16 higher bytes in the
	258	buffer as well, which makes 64 kbytes in total.
	259	*/
	260	/* additional explanation
	261	* t[0][BYTE] contains g*BYTE
	262	* t[1][BYTE] contains gx^8BYTE
	263	* ..
	264	* t[15][BYTE] contains gx^120BYTE */
c494e070 RS	265	struct gf128mul_64k gf128mul_init_64k_bbe(const be128 g)
	266	{
	267	struct gf128mul_64k *t;
	268	int i, j, k;
	269
	270	t = kzalloc(sizeof(*t), GFP_KERNEL);
	271	if (!t)
	272	goto out;
	273
	274	for (i = 0; i < 16; i++) {
	275	t->t[i] = kzalloc(sizeof(*t->t[i]), GFP_KERNEL);
	276	if (!t->t[i]) {
	277	gf128mul_free_64k(t);
	278	t = NULL;
	279	goto out;
	280	}
	281	}
	282
	283	t->t[0]->t[1] = *g;
	284	for (j = 1; j <= 64; j <<= 1)
	285	gf128mul_x_bbe(&t->t[0]->t[j + j], &t->t[0]->t[j]);
	286
	287	for (i = 0;;) {
	288	for (j = 2; j < 256; j += j)
	289	for (k = 1; k < j; ++k)
	290	be128_xor(&t->t[i]->t[j + k],
	291	&t->t[i]->t[j], &t->t[i]->t[k]);
	292
	293	if (++i >= 16)
	294	break;
	295
	296	for (j = 128; j > 0; j >>= 1) {
	297	t->t[i]->t[j] = t->t[i - 1]->t[j];
	298	gf128mul_x8_bbe(&t->t[i]->t[j]);
	299	}
	300	}
	301
	302	out:
	303	return t;
	304	}
	305	EXPORT_SYMBOL(gf128mul_init_64k_bbe);
	306
	307	void gf128mul_free_64k(struct gf128mul_64k *t)
	308	{
	309	int i;
	310
	311	for (i = 0; i < 16; i++)
75aa0a7c AC	312	kzfree(t->t[i]);
75aa0a7c AC	313	kzfree(t);
c494e070 RS	314	}
	315	EXPORT_SYMBOL(gf128mul_free_64k);
	316
c494e070 RS	317	void gf128mul_64k_bbe(be128 a, struct gf128mul_64k t)
	318	{
	319	u8 ap = (u8 )a;
	320	be128 r[1];
	321	int i;
	322
	323	*r = t->t[0]->t[ap[15]];
	324	for (i = 1; i < 16; ++i)
	325	be128_xor(r, r, &t->t[i]->t[ap[15 - i]]);
	326	a = r;
	327	}
	328	EXPORT_SYMBOL(gf128mul_64k_bbe);
	329
	330	/* This version uses 4k bytes of table space.
	331	A 16 byte buffer has to be multiplied by a 16 byte key
63be5b53	332	value in GF(2^128). If we consider a GF(2^128) value in a
c494e070 RS	333	single byte, we can construct a table of the 256 16 byte
	334	values that result from the 256 values of this byte.
	335	This requires 4096 bytes. If we take the highest byte in
	336	the buffer and use this table to get the result, we then
	337	have to multiply by x^120 to get the final value. For the
	338	next highest byte the result has to be multiplied by x^112
	339	and so on. But we can do this by accumulating the result
	340	in an accumulator starting with the result for the top
	341	byte. We repeatedly multiply the accumulator value by
	342	x^8 and then add in (i.e. xor) the 16 bytes of the next
	343	lower byte in the buffer, stopping when we reach the
	344	lowest byte. This requires a 4096 byte table.
	345	*/
	346	struct gf128mul_4k gf128mul_init_4k_lle(const be128 g)
	347	{
	348	struct gf128mul_4k *t;
	349	int j, k;
	350
	351	t = kzalloc(sizeof(*t), GFP_KERNEL);
	352	if (!t)
	353	goto out;
	354
	355	t->t[128] = *g;
	356	for (j = 64; j > 0; j >>= 1)
	357	gf128mul_x_lle(&t->t[j], &t->t[j+j]);
	358
	359	for (j = 2; j < 256; j += j)
	360	for (k = 1; k < j; ++k)
	361	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	362
	363	out:
	364	return t;
	365	}
	366	EXPORT_SYMBOL(gf128mul_init_4k_lle);
	367
	368	struct gf128mul_4k gf128mul_init_4k_bbe(const be128 g)
	369	{
	370	struct gf128mul_4k *t;
	371	int j, k;
	372
	373	t = kzalloc(sizeof(*t), GFP_KERNEL);
	374	if (!t)
	375	goto out;
	376
	377	t->t[1] = *g;
	378	for (j = 1; j <= 64; j <<= 1)
	379	gf128mul_x_bbe(&t->t[j + j], &t->t[j]);
	380
	381	for (j = 2; j < 256; j += j)
	382	for (k = 1; k < j; ++k)
	383	be128_xor(&t->t[j + k], &t->t[j], &t->t[k]);
	384
	385	out:
	386	return t;
	387	}
	388	EXPORT_SYMBOL(gf128mul_init_4k_bbe);
	389
	390	void gf128mul_4k_lle(be128 a, struct gf128mul_4k t)
	391	{
	392	u8 ap = (u8 )a;
	393	be128 r[1];
	394	int i = 15;
	395
	396	*r = t->t[ap[15]];
397	while (i--) {
398	gf128mul_x8_lle(r);
399	be128_xor(r, r, &t->t[ap[i]]);
400	}
401	a = r;
402	}
403	EXPORT_SYMBOL(gf128mul_4k_lle);
404
405	void gf128mul_4k_bbe(be128 a, struct gf128mul_4k t)
406	{
407	u8 ap = (u8 )a;
408	be128 r[1];
409	int i = 0;
410
411	*r = t->t[ap[0]];
412	while (++i < 16) {
413	gf128mul_x8_bbe(r);
414	be128_xor(r, r, &t->t[ap[i]]);
415	}
416	a = r;
417	}
418	EXPORT_SYMBOL(gf128mul_4k_bbe);
419
420	MODULE_LICENSE("GPL");
421	MODULE_DESCRIPTION("Functions for multiplying elements of GF(2^128)");