[linux-2.6-block.git] / include / linux / reciprocal_div.h

/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _LINUX_RECIPROCAL_DIV_H
#define _LINUX_RECIPROCAL_DIV_H

#include <linux/types.h>

/*
 * This algorithm is based on the paper "Division by Invariant
 * Integers Using Multiplication" by Torbjörn Granlund and Peter
 * L. Montgomery.
 *
 * The assembler implementation from Agner Fog, which this code is
 * based on, can be found here:
 * http://www.agner.org/optimize/asmlib.zip
 *
 * This optimization for A/B is helpful if the divisor B is mostly
 * runtime invariant. The reciprocal of B is calculated in the
 * slow-path with reciprocal_value(). The fast-path can then just use
 * a much faster multiplication operation with a variable dividend A
 * to calculate the division A/B.
 */

/* Precomputed constants for one divisor, consumed by reciprocal_divide(). */
struct reciprocal_value {
	/* 32-bit multiplier; the dividend is multiplied by it (64-bit product) */
	u32 m;
	/* first shift count, applied to (a - t) in reciprocal_divide() */
	u8 sh1;
	/* final shift count, applied to the summed intermediate result */
	u8 sh2;
};

/* "reciprocal_value" and "reciprocal_divide" together implement the basic
 * version of the algorithm described in Figure 4.1 of the paper.
 *
 * reciprocal_value() is the slow-path half: for a divisor @d it computes,
 * once, the multiplier and the two shift counts that reciprocal_divide()
 * consumes on every subsequent division by @d.
 */
struct reciprocal_value reciprocal_value(u32 d);

/* Fast-path half of the basic algorithm: divide @a by the divisor that @R was
 * computed from, using one widening multiply, one subtract, one add and two
 * shifts instead of a hardware divide.
 */
static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
{
	/* High 32 bits of the 64-bit product a * m. */
	u64 product = (u64)a * R.m;
	u32 hi = (u32)(product >> 32);
	/* hi <= a because m fits in 32 bits, so (a - hi) cannot wrap. */
	u32 adjust = (a - hi) >> R.sh1;

	return (hi + adjust) >> R.sh2;
}

/* Result of reciprocal_value_adv(); see the usage pseudo code that accompanies
 * the reciprocal_value_adv() declaration for how each field is consumed.
 */
struct reciprocal_value_adv {
	/* 32-bit multiplier applied to the (possibly pre-shifted) dividend */
	u32 m;
	/* shift count applied after the multiplication */
	u8 sh;
	/* exponent checked for the power-of-two divisor case (d == 1U << exp) */
	u8 exp;
	/* when true, the longer subtract/shift/add emulation sequence is needed */
	bool is_wide_m;
};

/* "reciprocal_value_adv" implements the advanced version of the algorithm
 * described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
 * ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
 * exception case can easily be handled before calling "reciprocal_value_adv".
 *
 * The advanced version requires a more complex calculation to get the
 * reciprocal multiplier and the other control variables, but in return it can
 * reduce the number of emulation operations required.
 *
 * It makes no sense to use this advanced version for host divide emulation:
 * the extra complexity of calculating the multiplier etc. could completely
 * cancel out the saving on emulation operations.
 *
 * However, it makes sense to use it for JIT divide code generation, where we
 * are willing to spend extra time on the host to get faster JITed code. As
 * shown by the following pseudo code, the required emulation operations could
 * go down from 6 (the basic version) to 3 or 4.
 *
 * @d:    divisor; must not be greater than (1U << 31), see above.
 * @prec: precision in bits; the pseudo code below passes 32, or
 *        32 - pre_shift when the dividend is pre-shifted.
 *
 * To use the result of "reciprocal_value_adv", suppose we want to calculate
 * n/d, the pseudo C code will be:
 *
 *   struct reciprocal_value_adv rvalue;
 *   u8 pre_shift, exp;
 *
 *   // handle exception case.
 *   if (d >= (1U << 31)) {
 *     result = n >= d;
 *     return;
 *   }
 *
 *   rvalue = reciprocal_value_adv(d, 32);
 *   // exp is taken from the result for the original, un-shifted divisor.
 *   exp = rvalue.exp;
 *   if (rvalue.is_wide_m && !(d & 1)) {
 *     // floor(log2(d & (2^32 - d)))
 *     pre_shift = fls(d & -d) - 1;
 *     rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
 *   } else {
 *     pre_shift = 0;
 *   }
 *
 *   // code generation starts.
 *   if (d == 1U << exp) {
 *     // divisor is a power of two: a single shift is enough.
 *     result = n >> exp;
 *   } else if (rvalue.is_wide_m) {
 *     // pre_shift must be zero when reached here.
 *     // the multiply must be widened, otherwise ">> 32" always yields 0.
 *     t = ((u64)n * rvalue.m) >> 32;
 *     result = n - t;
 *     result >>= 1;
 *     result += t;
 *     result >>= rvalue.sh - 1;
 *   } else {
 *     result = n;
 *     if (pre_shift)
 *       result >>= pre_shift;
 *     result = ((u64)result * rvalue.m) >> 32;
 *     result >>= rvalue.sh;
 *   }
 */
struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);

#endif /* _LINUX_RECIPROCAL_DIV_H */
Commit	Line	Data
b2441318	1	/* SPDX-License-Identifier: GPL-2.0 */
6a2d7a95 ED	2	#ifndef _LINUX_RECIPROCAL_DIV_H
	3	#define _LINUX_RECIPROCAL_DIV_H
	4
	5	#include <linux/types.h>
	6
	7	/*
809fa972 HFS	8	* This algorithm is based on the paper "Division by Invariant
	9	* Integers Using Multiplication" by Torbjörn Granlund and Peter
	10	* L. Montgomery.
6a2d7a95	11	*
809fa972 HFS	12	* The assembler implementation from Agner Fog, which this code is
	13	* based on, can be found here:
	14	* http://www.agner.org/optimize/asmlib.zip
6a2d7a95	15	*
809fa972 HFS	16	* This optimization for A/B is helpful if the divisor B is mostly
	17	* runtime invariant. The reciprocal of B is calculated in the
	18	* slow-path with reciprocal_value(). The fast-path can then just use
	19	* a much faster multiplication operation with a variable dividend A
	20	* to calculate the division A/B.
6a2d7a95 ED	21	*/
6a2d7a95 ED	22
809fa972 HFS	23	struct reciprocal_value {
	24	u32 m;
	25	u8 sh1, sh2;
	26	};
6a2d7a95	27
06ae4826 JW	28	/* "reciprocal_value" and "reciprocal_divide" together implement the basic
	29	* version of the algorithm described in Figure 4.1 of the paper.
	30	*/
809fa972	31	struct reciprocal_value reciprocal_value(u32 d);
6a2d7a95	32
809fa972	33	static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
6a2d7a95	34	{
809fa972 HFS	35	u32 t = (u32)(((u64)a * R.m) >> 32);
809fa972 HFS	36	return (t + ((a - t) >> R.sh1)) >> R.sh2;
6a2d7a95	37	}
809fa972	38
06ae4826 JW	39	struct reciprocal_value_adv {
	40	u32 m;
	41	u8 sh, exp;
	42	bool is_wide_m;
	43	};
	44
	45	/* "reciprocal_value_adv" implements the advanced version of the algorithm
	46	* described in Figure 4.2 of the paper except when "divisor > (1U << 31)" whose
	47	* ceil(log2(d)) result will be 32 which then requires u128 divide on host. The
	48	* exception case could be easily handled before calling "reciprocal_value_adv".
	49	*
	50	* The advanced version requires more complex calculation to get the reciprocal
	51	* multiplier and other control variables, but then could reduce the required
	52	* emulation operations.
	53	*
	54	* It makes no sense to use this advanced version for host divide emulation,
	55	* those extra complexities for calculating multiplier etc could completely
	56	* waive our saving on emulation operations.
	57	*
	58	* However, it makes sense to use it for JIT divide code generation for which
	59	* we are willing to trade performance of JITed code with that of host. As shown
	60	* by the following pseudo code, the required emulation operations could go down
	61	* from 6 (the basic version) to 3 or 4.
	62	*
	63	* To use the result of "reciprocal_value_adv", suppose we want to calculate
	64	* n/d, the pseudo C code will be:
	65	*
	66	* struct reciprocal_value_adv rvalue;
	67	* u8 pre_shift, exp;
	68	*
	69	* // handle exception case.
	70	* if (d >= (1U << 31)) {
	71	* result = n >= d;
	72	* return;
	73	* }
	74	*
	75	* rvalue = reciprocal_value_adv(d, 32)
	76	* exp = rvalue.exp;
	77	* if (rvalue.is_wide_m && !(d & 1)) {
	78	* // floor(log2(d & (2^32 -d)))
	79	* pre_shift = fls(d & -d) - 1;
	80	* rvalue = reciprocal_value_adv(d >> pre_shift, 32 - pre_shift);
	81	* } else {
	82	* pre_shift = 0;
	83	* }
	84	*
	85	* // code generation starts.
	86	* if (imm == 1U << exp) {
	87	* result = n >> exp;
	88	* } else if (rvalue.is_wide_m) {
	89	* // pre_shift must be zero when reached here.
	90	* t = (n * rvalue.m) >> 32;
	91	* result = n - t;
	92	* result >>= 1;
	93	* result += t;
	94	* result >>= rvalue.sh - 1;
	95	* } else {
	96	* if (pre_shift)
	97	* result = n >> pre_shift;
	98	* result = ((u64)result * rvalue.m) >> 32;
	99	* result >>= rvalue.sh;
	100	* }
	101	*/
	102	struct reciprocal_value_adv reciprocal_value_adv(u32 d, u8 prec);
103
809fa972	104	#endif /* _LINUX_RECIPROCAL_DIV_H */