diff --git a/include/net/pkt_cls.h b/include/net/pkt_cls.h
index e80edd8879efc70f96dc0fbe91714c48714edc54..3009547f3c66347b38be87831ec17d7aa7a149ed 100644
--- a/include/net/pkt_cls.h
+++ b/include/net/pkt_cls.h
@@ -2,6 +2,7 @@
 #define __NET_PKT_CLS_H
 
 #include <linux/pkt_cls.h>
+#include <linux/workqueue.h>
 #include <net/sch_generic.h>
 #include <net/act_api.h>
 
@@ -17,6 +18,8 @@ struct tcf_walker {
 int register_tcf_proto_ops(struct tcf_proto_ops *ops);
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops);
 
+bool tcf_queue_work(struct work_struct *work);
+
 #ifdef CONFIG_NET_CLS
 struct tcf_chain *tcf_chain_get(struct tcf_block *block, u32 chain_index,
 				bool create);
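
The tcf_queue_work() declaration above is the hook classifiers are expected to call from their call_rcu() callbacks, so that the actual filter teardown runs in process context on the ordered tc_filter_wq created below rather than in softirq context. What follows is a minimal, illustrative sketch of that pattern; struct foo_filter, foo_delete_filter_work() and foo_delete_filter() are hypothetical names and are not part of this patch:

/* Illustrative only -- not part of this patch.  A made-up classifier type
 * showing the intended two-stage deferral: call_rcu() callback (atomic
 * context) -> tcf_queue_work() -> teardown on tc_filter_wq (process
 * context, where sleeping and taking the RTNL lock are allowed).
 */
#include <linux/rtnetlink.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <net/pkt_cls.h>

struct foo_filter {			/* hypothetical filter state */
	struct tcf_exts exts;
	struct rcu_head rcu;
	struct work_struct work;
};

static void foo_delete_filter_work(struct work_struct *work)
{
	struct foo_filter *f = container_of(work, struct foo_filter, work);

	rtnl_lock();
	tcf_exts_destroy(&f->exts);	/* needs RTNL, may sleep */
	rtnl_unlock();

	kfree(f);
}

static void foo_delete_filter(struct rcu_head *head)
{
	struct foo_filter *f = container_of(head, struct foo_filter, rcu);

	/* Softirq context: do not free here, only queue the real work. */
	INIT_WORK(&f->work, foo_delete_filter_work);
	tcf_queue_work(&f->work);
}

A classifier's delete path would then hand the filter to RCU with call_rcu(&f->rcu, foo_delete_filter), right where it previously freed the filter directly in the RCU callback; the ordered workqueue serializes these work items and runs them where taking the RTNL lock is permitted.
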
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index 135f5a2dd93122dd905557028068a31aeea37cb0..0dec8a23be574cd54e4dab927f6df5c7879c33aa 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -10,6 +10,7 @@
 #include <linux/dynamic_queue_limits.h>
 #include <linux/list.h>
 #include <linux/refcount.h>
+#include <linux/workqueue.h>
 #include <net/gen_stats.h>
 #include <net/rtnetlink.h>
 
@@ -271,6 +272,7 @@ struct tcf_chain {
 
 struct tcf_block {
 	struct list_head chain_list;
+	struct work_struct work;
 };
 
 static inline void qdisc_cb_private_validate(const struct sk_buff *skb, int sz)
diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
index 0b2219adf520b33a471dd3b8e2fdf9f46271c3f9..045d13679ad69b12fdec83f8ed2871ad0f8dd527 100644
--- a/net/sched/cls_api.c
+++ b/net/sched/cls_api.c
@@ -77,6 +77,8 @@ int register_tcf_proto_ops(struct tcf_proto_ops *ops)
 }
 EXPORT_SYMBOL(register_tcf_proto_ops);
 
+static struct workqueue_struct *tc_filter_wq;
+
 int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 {
 	struct tcf_proto_ops *t;
@@ -86,6 +88,7 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 	 * tcf_proto_ops's destroy() handler.
 	 */
 	rcu_barrier();
+	flush_workqueue(tc_filter_wq);
 
 	write_lock(&cls_mod_lock);
 	list_for_each_entry(t, &tcf_proto_base, head) {
@@ -100,6 +103,12 @@ int unregister_tcf_proto_ops(struct tcf_proto_ops *ops)
 }
 EXPORT_SYMBOL(unregister_tcf_proto_ops);
 
+bool tcf_queue_work(struct work_struct *work)
+{
+	return queue_work(tc_filter_wq, work);
+}
+EXPORT_SYMBOL(tcf_queue_work);
+
 /* Select new prio value from the range, managed by kernel. */
 
 static inline u32 tcf_auto_prio(struct tcf_proto *tp)
@@ -266,23 +275,30 @@ int tcf_block_get(struct tcf_block **p_block,
 }
 EXPORT_SYMBOL(tcf_block_get);
 
-void tcf_block_put(struct tcf_block *block)
+static void tcf_block_put_final(struct work_struct *work)
 {
+	struct tcf_block *block = container_of(work, struct tcf_block, work);
 	struct tcf_chain *chain, *tmp;
 
-	if (!block)
-		return;
-
-	/* XXX: Standalone actions are not allowed to jump to any chain, and
-	 * bound actions should be all removed after flushing. However,
-	 * filters are destroyed in RCU callbacks, we have to hold the chains
-	 * first, otherwise we would always race with RCU callbacks on this list
-	 * without proper locking.
-	 */
+	/* At this point, all the chains should have refcnt == 1. */
+	rtnl_lock();
+	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
+		tcf_chain_put(chain);
+	rtnl_unlock();
+	kfree(block);
+}
 
-	/* Wait for existing RCU callbacks to cool down. */
-	rcu_barrier();
+/* XXX: Standalone actions are not allowed to jump to any chain, and bound
+ * actions should all be removed after flushing. However, since filters are
+ * destroyed in RCU callbacks, we have to hold the chains first; otherwise we
+ * would always race with RCU callbacks on this list without proper locking.
+ */
+static void tcf_block_put_deferred(struct work_struct *work)
+{
+	struct tcf_block *block = container_of(work, struct tcf_block, work);
+	struct tcf_chain *chain;
 
+	rtnl_lock();
 	/* Hold a refcnt for all chains, except 0, in case they are gone. */
 	list_for_each_entry(chain, &block->chain_list, list)
 		if (chain->index)
@@ -292,13 +308,27 @@ void tcf_block_put(struct tcf_block *block)
 	list_for_each_entry(chain, &block->chain_list, list)
 		tcf_chain_flush(chain);
 
-	/* Wait for RCU callbacks to release the reference count. */
+	INIT_WORK(&block->work, tcf_block_put_final);
+	/* Wait for RCU callbacks to release the reference count and to make
+	 * sure their work items have been queued before this point.
+	 */
 	rcu_barrier();
+	tcf_queue_work(&block->work);
+	rtnl_unlock();
+}
 
-	/* At this point, all the chains should have refcnt == 1. */
-	list_for_each_entry_safe(chain, tmp, &block->chain_list, list)
-		tcf_chain_put(chain);
-	kfree(block);
+void tcf_block_put(struct tcf_block *block)
+{
+	if (!block)
+		return;
+
+	INIT_WORK(&block->work, tcf_block_put_deferred);
+	/* Wait for existing RCU callbacks to cool down and to make sure their
+	 * work items have been queued before this point. We cannot flush the
+	 * pending work here because we are holding the RTNL lock.
+	 */
+	rcu_barrier();
+	tcf_queue_work(&block->work);
 }
 EXPORT_SYMBOL(tcf_block_put);
 
@@ -1030,6 +1060,10 @@ EXPORT_SYMBOL(tcf_exts_get_dev);
 
 static int __init tc_filter_init(void)
 {
+	tc_filter_wq = alloc_ordered_workqueue("tc_filter_workqueue", 0);
+	if (!tc_filter_wq)
+		return -ENOMEM;
+
 	rtnl_register(PF_UNSPEC, RTM_NEWTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_DELTFILTER, tc_ctl_tfilter, NULL, 0);
 	rtnl_register(PF_UNSPEC, RTM_GETTFILTER, tc_ctl_tfilter,