diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index 3220b488dd1e043551d08b8031043fbd70c0b517..f100bd958b8896d594d74012273137387e156b57 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -79,7 +79,6 @@
 #include <net/pkt_sched.h>
 #include <linux/rculist.h>
 #include <net/flow_keys.h>
-#include <linux/reciprocal_div.h>
 #include "bonding.h"
 #include "bond_3ad.h"
 #include "bond_alb.h"
@@ -3596,8 +3595,9 @@ static void bond_xmit_slave_id(struct bonding *bond, struct sk_buff *skb, int sl
  */
 static u32 bond_rr_gen_slave_id(struct bonding *bond)
 {
-	int packets_per_slave = bond->params.packets_per_slave;
 	u32 slave_id;
+	struct reciprocal_value reciprocal_packets_per_slave;
+	int packets_per_slave = bond->params.packets_per_slave;
 
 	switch (packets_per_slave) {
 	case 0:
@@ -3607,8 +3607,10 @@ static u32 bond_rr_gen_slave_id(struct bonding *bond)
 		slave_id = bond->rr_tx_counter;
 		break;
 	default:
+		reciprocal_packets_per_slave =
+			bond->params.reciprocal_packets_per_slave;
 		slave_id = reciprocal_divide(bond->rr_tx_counter,
-					     packets_per_slave);
+					     reciprocal_packets_per_slave);
 		break;
 	}
 	bond->rr_tx_counter++;
@@ -4343,10 +4345,18 @@ static int bond_check_params(struct bond_params *params)
 	params->resend_igmp = resend_igmp;
 	params->min_links = min_links;
 	params->lp_interval = lp_interval;
-	if (packets_per_slave > 1)
-		params->packets_per_slave = reciprocal_value(packets_per_slave);
-	else
-		params->packets_per_slave = packets_per_slave;
+	params->packets_per_slave = packets_per_slave;
+	if (packets_per_slave > 0) {
+		params->reciprocal_packets_per_slave =
+			reciprocal_value(packets_per_slave);
+	} else {
+		/* reciprocal_packets_per_slave is unused if
+		 * packets_per_slave is 0 or 1, just initialize it
+		 */
+		params->reciprocal_packets_per_slave =
+			(struct reciprocal_value) { 0 };
+	}
+
 	if (primary) {
 		strncpy(params->primary, primary, IFNAMSIZ);
 		params->primary[IFNAMSIZ - 1] = 0;
diff --git a/drivers/net/bonding/bond_netlink.c b/drivers/net/bonding/bond_netlink.c
index 21c648854a8c4861d7e26bdca97d940f8c28c466..e8526552790cdab3e1c6cfe5cfb6558af2776947 100644
--- a/drivers/net/bonding/bond_netlink.c
+++ b/drivers/net/bonding/bond_netlink.c
@@ -19,7 +19,6 @@
 #include <linux/if_ether.h>
 #include <net/netlink.h>
 #include <net/rtnetlink.h>
-#include <linux/reciprocal_div.h>
 #include "bonding.h"
 
 int bond_get_slave(struct net_device *slave_dev, struct sk_buff *skb)
@@ -452,9 +451,6 @@ static int bond_fill_info(struct sk_buff *skb,
 		goto nla_put_failure;
 
 	packets_per_slave = bond->params.packets_per_slave;
-	if (packets_per_slave > 1)
-		packets_per_slave = reciprocal_value(packets_per_slave);
-
 	if (nla_put_u32(skb, IFLA_BOND_PACKETS_PER_SLAVE,
 			packets_per_slave))
 		goto nla_put_failure;
diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
index 945a6668da8376ad125356a300b9d34a72ea4166..85e434886f2e116a8b3ec13962dd70ba0ea884b4 100644
--- a/drivers/net/bonding/bond_options.c
+++ b/drivers/net/bonding/bond_options.c
@@ -16,7 +16,6 @@
 #include <linux/netdevice.h>
 #include <linux/rwlock.h>
 #include <linux/rcupdate.h>
-#include <linux/reciprocal_div.h>
 #include "bonding.h"
 
 int bond_option_mode_set(struct bonding *bond, int mode)
@@ -671,11 +670,17 @@ int bond_option_packets_per_slave_set(struct bonding *bond,
 		pr_warn("%s: Warning: packets_per_slave has effect only in balance-rr mode\n",
 			bond->dev->name);
 
-	if (packets_per_slave > 1)
-		bond->params.packets_per_slave =
+	bond->params.packets_per_slave = packets_per_slave;
+	if (packets_per_slave > 0) {
+		bond->params.reciprocal_packets_per_slave =
 			reciprocal_value(packets_per_slave);
-	else
-		bond->params.packets_per_slave = packets_per_slave;
+	} else {
+		/* reciprocal_packets_per_slave is unused if
+		 * packets_per_slave is 0 or 1, just initialize it
+		 */
+		bond->params.reciprocal_packets_per_slave =
+			(struct reciprocal_value) { 0 };
+	}
 
 	return 0;
 }
diff --git a/drivers/net/bonding/bond_sysfs.c b/drivers/net/bonding/bond_sysfs.c
index 011f163c2c6771a9cbd335a507dfb390b081eca2..c083e9a66ece736493295ba6044ea2377b446d11 100644
--- a/drivers/net/bonding/bond_sysfs.c
+++ b/drivers/net/bonding/bond_sysfs.c
@@ -39,7 +39,6 @@
 #include <net/net_namespace.h>
 #include <net/netns/generic.h>
 #include <linux/nsproxy.h>
-#include <linux/reciprocal_div.h>
 
 #include "bonding.h"
 
@@ -1374,10 +1373,6 @@ static ssize_t bonding_show_packets_per_slave(struct device *d,
 {
 	struct bonding *bond = to_bond(d);
 	unsigned int packets_per_slave = bond->params.packets_per_slave;
-
-	if (packets_per_slave > 1)
-		packets_per_slave = reciprocal_value(packets_per_slave);
-
 	return sprintf(buf, "%u\n", packets_per_slave);
 }
 
diff --git a/drivers/net/bonding/bonding.h b/drivers/net/bonding/bonding.h
index 8a935f8f2b3c558ec7d288149ced77e3c60cc281..0a616c41dc9496c3cce31f6890ba3eeb992a96d5 100644
--- a/drivers/net/bonding/bonding.h
+++ b/drivers/net/bonding/bonding.h
@@ -23,6 +23,8 @@
 #include <linux/netpoll.h>
 #include <linux/inetdevice.h>
 #include <linux/etherdevice.h>
+#include <linux/reciprocal_div.h>
+
 #include "bond_3ad.h"
 #include "bond_alb.h"
 
@@ -171,6 +173,7 @@ struct bond_params {
 	int resend_igmp;
 	int lp_interval;
 	int packets_per_slave;
+	struct reciprocal_value reciprocal_packets_per_slave;
 };
 
 struct bond_parm_tbl {
diff --git a/include/linux/flex_array.h b/include/linux/flex_array.h
index 6843cf193a445134cde17f7d2db6893399ac8cda..b6efb0c64408b35b7874726ca22c72f2e5fe21f3 100644
--- a/include/linux/flex_array.h
+++ b/include/linux/flex_array.h
@@ -2,6 +2,7 @@
 #define _FLEX_ARRAY_H
 
 #include <linux/types.h>
+#include <linux/reciprocal_div.h>
 #include <asm/page.h>
 
 #define FLEX_ARRAY_PART_SIZE PAGE_SIZE
@@ -22,7 +23,7 @@ struct flex_array {
 			int element_size;
 			int total_nr_elements;
 			int elems_per_part;
-			u32 reciprocal_elems;
+			struct reciprocal_value reciprocal_elems;
 			struct flex_array_part *parts[];
 		};
 		/*
diff --git a/include/linux/reciprocal_div.h b/include/linux/reciprocal_div.h
index f9c90b33285b24b2ef9870ee3d3fe3cf78f9283d..8c5a3fb6c6c57821720dc307f3eeec944c56f974 100644
--- a/include/linux/reciprocal_div.h
+++ b/include/linux/reciprocal_div.h
@@ -4,29 +4,32 @@
 #include <linux/types.h>
 
 /*
- * This file describes reciprocical division.
+ * This algorithm is based on the paper "Division by Invariant
+ * Integers Using Multiplication" by Torbjörn Granlund and Peter
+ * L. Montgomery.
  *
- * This optimizes the (A/B) problem, when A and B are two u32
- * and B is a known value (but not known at compile time)
+ * The assembler implementation from Agner Fog, which this code is
+ * based on, can be found here:
+ * http://www.agner.org/optimize/asmlib.zip
  *
- * The math principle used is :
- *   Let RECIPROCAL_VALUE(B) be (((1LL << 32) + (B - 1))/ B)
- *   Then A / B = (u32)(((u64)(A) * (R)) >> 32)
- *
- * This replaces a divide by a multiply (and a shift), and
- * is generally less expensive in CPU cycles.
+ * This optimization for A/B is helpful if the divisor B is mostly
+ * runtime invariant. The reciprocal of B is calculated in the
+ * slow-path with reciprocal_value(). The fast-path can then just use
+ * a much faster multiplication operation with a variable dividend A
+ * to calculate the division A/B.
  */
 
-/*
- * Computes the reciprocal value (R) for the value B of the divisor.
- * Should not be called before each reciprocal_divide(),
- * or else the performance is slower than a normal divide.
- */
-extern u32 reciprocal_value(u32 B);
+struct reciprocal_value {
+	u32 m;
+	u8 sh1, sh2;
+};
 
+struct reciprocal_value reciprocal_value(u32 d);
 
-static inline u32 reciprocal_divide(u32 A, u32 R)
+static inline u32 reciprocal_divide(u32 a, struct reciprocal_value R)
 {
-	return (u32)(((u64)A * R) >> 32);
+	u32 t = (u32)(((u64)a * R.m) >> 32);
+	return (t + ((a - t) >> R.sh1)) >> R.sh2;
 }
-#endif
+
+#endif /* _LINUX_RECIPROCAL_DIV_H */
diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h
index 09bfffb08a56db285caa27146202f04e2188b480..96e8abae19a943375e39da1c779016a72cc07b7e 100644
--- a/include/linux/slab_def.h
+++ b/include/linux/slab_def.h
@@ -1,6 +1,8 @@
 #ifndef _LINUX_SLAB_DEF_H
 #define	_LINUX_SLAB_DEF_H
 
+#include <linux/reciprocal_div.h>
+
 /*
  * Definitions unique to the original Linux SLAB allocator.
  */
@@ -12,7 +14,7 @@ struct kmem_cache {
 	unsigned int shared;
 
 	unsigned int size;
-	u32 reciprocal_buffer_size;
+	struct reciprocal_value reciprocal_buffer_size;
 /* 2) touched by every alloc & free from the backend */
 
 	unsigned int flags;		/* constant flags */
diff --git a/include/net/red.h b/include/net/red.h
index 168bb2f495f207511bffe485c1070ea8bb6df30a..76e0b5f922c6c475e7f47a73d072282dfec53149 100644
--- a/include/net/red.h
+++ b/include/net/red.h
@@ -130,7 +130,8 @@ struct red_parms {
 	u32		qth_max;	/* Max avg length threshold: Wlog scaled */
 	u32		Scell_max;
 	u32		max_P;		/* probability, [0 .. 1.0] 32 scaled */
-	u32		max_P_reciprocal; /* reciprocal_value(max_P / qth_delta) */
+	/* reciprocal_value(max_P / qth_delta) */
+	struct reciprocal_value	max_P_reciprocal;
 	u32		qth_delta;	/* max_th - min_th */
 	u32		target_min;	/* min_th + 0.4*(max_th - min_th) */
 	u32		target_max;	/* min_th + 0.6*(max_th - min_th) */
diff --git a/lib/flex_array.c b/lib/flex_array.c
index 6948a6692fc4f6b70c2bfc0700afb7a971d7059e..2eed22fa507c7cb0756d7ef643f8a3454eb455ec 100644
--- a/lib/flex_array.c
+++ b/lib/flex_array.c
@@ -90,8 +90,8 @@ struct flex_array *flex_array_alloc(int element_size, unsigned int total,
 {
 	struct flex_array *ret;
 	int elems_per_part = 0;
-	int reciprocal_elems = 0;
 	int max_size = 0;
+	struct reciprocal_value reciprocal_elems = { 0 };
 
 	if (element_size) {
 		elems_per_part = FLEX_ARRAY_ELEMENTS_PER_PART(element_size);
@@ -119,6 +119,11 @@ EXPORT_SYMBOL(flex_array_alloc);
 static int fa_element_to_part_nr(struct flex_array *fa,
 					unsigned int element_nr)
 {
+	/*
+	 * if element_size == 0 we don't get here, so we never touch
+	 * the zeroed fa->reciprocal_elems, which would yield invalid
+	 * results
+	 */
 	return reciprocal_divide(element_nr, fa->reciprocal_elems);
 }
 
diff --git a/lib/reciprocal_div.c b/lib/reciprocal_div.c
index 75510e94f7d0b2fcd5736a4e0aeb9ff282da11d1..464152410c5123e191324a4cfb3ca5ea8ff66687 100644
--- a/lib/reciprocal_div.c
+++ b/lib/reciprocal_div.c
@@ -1,11 +1,27 @@
+#include <linux/kernel.h>
 #include <asm/div64.h>
 #include <linux/reciprocal_div.h>
 #include <linux/export.h>
 
-u32 reciprocal_value(u32 k)
+/*
+ * For a description of the algorithm please have a look at
+ * include/linux/reciprocal_div.h
+ */
+
+struct reciprocal_value reciprocal_value(u32 d)
 {
-	u64 val = (1LL << 32) + (k - 1);
-	do_div(val, k);
-	return (u32)val;
+	struct reciprocal_value R;
+	u64 m;
+	int l;
+
+	l = fls(d - 1);
+	m = ((1ULL << 32) * ((1ULL << l) - d));
+	do_div(m, d);
+	++m;
+	R.m = (u32)m;
+	R.sh1 = min(l, 1);
+	R.sh2 = max(l - 1, 0);
+
+	return R;
 }
 EXPORT_SYMBOL(reciprocal_value);
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index a2bfc371b44a3339709be67d58004a00400abe8e..de1059af6da14c563115147852c4f52b80a9387d 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -91,7 +91,7 @@ struct netem_sched_data {
 	u64 rate;
 	s32 packet_overhead;
 	u32 cell_size;
-	u32 cell_size_reciprocal;
+	struct reciprocal_value cell_size_reciprocal;
 	s32 cell_overhead;
 
 	struct crndstate {
@@ -725,9 +725,11 @@ static void get_rate(struct Qdisc *sch, const struct nlattr *attr)
 	q->rate = r->rate;
 	q->packet_overhead = r->packet_overhead;
 	q->cell_size = r->cell_size;
+	q->cell_overhead = r->cell_overhead;
 	if (q->cell_size)
 		q->cell_size_reciprocal = reciprocal_value(q->cell_size);
-	q->cell_overhead = r->cell_overhead;
+	else
+		q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 }
 
 static int get_loss_clg(struct Qdisc *sch, const struct nlattr *attr)