diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index 5abcbdc743fa255064488f199c3a881bf2ee53d6..086dcbadce0971a43cd099fa6e0bbcd23bcc17e1 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1256,9 +1256,9 @@ void clean_tree_block(struct btrfs_fs_info *fs_info,
 		btrfs_assert_tree_locked(buf);
 
 		if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)) {
-			__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-					     -buf->len,
-					     fs_info->dirty_metadata_batch);
+			percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+						 -buf->len,
+						 fs_info->dirty_metadata_batch);
 			/* ugh, clear_extent_buffer_dirty needs to lock the page */
 			btrfs_set_lock_blocking(buf);
 			clear_extent_buffer_dirty(buf);
@@ -4047,9 +4047,9 @@ void btrfs_mark_buffer_dirty(struct extent_buffer *buf)
 			buf->start, transid, fs_info->generation);
 	was_dirty = set_extent_buffer_dirty(buf);
 	if (!was_dirty)
-		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-				     buf->len,
-				     fs_info->dirty_metadata_batch);
+		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+					 buf->len,
+					 fs_info->dirty_metadata_batch);
 #ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
 	if (btrfs_header_level(buf) == 0 && check_leaf(root, buf)) {
 		btrfs_print_leaf(fs_info, buf);
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index 7a18b5762ac903c2a3a9ffb05c87614a6c6b1e1a..556484cf5d9338d3383a23de7d484df2b5348d73 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -3577,9 +3577,9 @@ lock_extent_buffer_for_io(struct extent_buffer *eb,
 		set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags);
 		spin_unlock(&eb->refs_lock);
 		btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN);
-		__percpu_counter_add(&fs_info->dirty_metadata_bytes,
-				     -eb->len,
-				     fs_info->dirty_metadata_batch);
+		percpu_counter_add_batch(&fs_info->dirty_metadata_bytes,
+					 -eb->len,
+					 fs_info->dirty_metadata_batch);
 		ret = 1;
 	} else {
 		spin_unlock(&eb->refs_lock);
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 8d050314591cd38bf4f40a212b0bc613b2773779..06dea7c89bbde5866664294a8d639b13d6dcf7bb 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -1763,8 +1763,8 @@ static void btrfs_set_bit_hook(void *private_data,
 		if (btrfs_is_testing(fs_info))
 			return;
 
-		__percpu_counter_add(&fs_info->delalloc_bytes, len,
-				     fs_info->delalloc_batch);
+		percpu_counter_add_batch(&fs_info->delalloc_bytes, len,
+					 fs_info->delalloc_batch);
 		spin_lock(&BTRFS_I(inode)->lock);
 		BTRFS_I(inode)->delalloc_bytes += len;
 		if (*bits & EXTENT_DEFRAG)
@@ -1838,8 +1838,8 @@ static void btrfs_clear_bit_hook(void *private_data,
 					&inode->vfs_inode,
 					state->start, len);
 
-		__percpu_counter_add(&fs_info->delalloc_bytes, -len,
-				     fs_info->delalloc_batch);
+		percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
+					 fs_info->delalloc_batch);
 		spin_lock(&inode->lock);
 		inode->delalloc_bytes -= len;
 		if (do_list && inode->delalloc_bytes == 0 &&
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index d249546da15ef0b2e847f8a2a6c354c94d4600b6..43d07f9c4e9ed1fbca5b25bdc021b7900fe18f07 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -1211,7 +1211,7 @@ xfs_mod_icount(
 	struct xfs_mount	*mp,
 	int64_t			delta)
 {
-	__percpu_counter_add(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
+	percpu_counter_add_batch(&mp->m_icount, delta, XFS_ICOUNT_BATCH);
 	if (__percpu_counter_compare(&mp->m_icount, 0, XFS_ICOUNT_BATCH) < 0) {
 		ASSERT(0);
 		percpu_counter_add(&mp->m_icount, -delta);
@@ -1290,7 +1290,7 @@ xfs_mod_fdblocks(
 	else
 		batch = XFS_FDBLOCKS_BATCH;
 
-	__percpu_counter_add(&mp->m_fdblocks, delta, batch);
+	percpu_counter_add_batch(&mp->m_fdblocks, delta, batch);
 	if (__percpu_counter_compare(&mp->m_fdblocks, mp->m_alloc_set_aside,
 				     XFS_FDBLOCKS_BATCH) >= 0) {
 		/* we had space! */
diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h
index 557d84063934c65c2aa96c2b6141425af8a567bf..ace73f96eb1eef15970430d22ab7644469823f29 100644
--- a/include/linux/backing-dev.h
+++ b/include/linux/backing-dev.h
@@ -66,7 +66,7 @@ static inline bool bdi_has_dirty_io(struct backing_dev_info *bdi)
 static inline void __add_wb_stat(struct bdi_writeback *wb,
 				 enum wb_stat_item item, s64 amount)
 {
-	__percpu_counter_add(&wb->stat[item], amount, WB_STAT_BATCH);
+	percpu_counter_add_batch(&wb->stat[item], amount, WB_STAT_BATCH);
 }
 
 static inline void __inc_wb_stat(struct bdi_writeback *wb,
diff --git a/include/linux/blk-cgroup.h b/include/linux/blk-cgroup.h
index 01b62e7bac74bbc792d19255bacea90ae5b52abf..7104bea8dab1c9895ccc916c5d90912fb41db77e 100644
--- a/include/linux/blk-cgroup.h
+++ b/include/linux/blk-cgroup.h
@@ -518,7 +518,7 @@ static inline void blkg_stat_exit(struct blkg_stat *stat)
  */
 static inline void blkg_stat_add(struct blkg_stat *stat, uint64_t val)
 {
-	__percpu_counter_add(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(&stat->cpu_cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
@@ -597,14 +597,14 @@ static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
 
-	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 
 	if (op_is_sync(op))
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 	else
 		cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 
-	__percpu_counter_add(cnt, val, BLKG_STAT_CPU_BATCH);
+	percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 }
 
 /**
diff --git a/include/linux/mman.h b/include/linux/mman.h
index 634c4c51fe3adaee4b65d9a977e954f42ecf6131..c8367041fafde6e761044e7d77fb25736e58a863 100644
--- a/include/linux/mman.h
+++ b/include/linux/mman.h
@@ -22,7 +22,7 @@ unsigned long vm_memory_committed(void);
 
 static inline void vm_acct_memory(long pages)
 {
-	__percpu_counter_add(&vm_committed_as, pages, vm_committed_as_batch);
+	percpu_counter_add_batch(&vm_committed_as, pages, vm_committed_as_batch);
 }
 
 static inline void vm_unacct_memory(long pages)
diff --git a/include/linux/percpu_counter.h b/include/linux/percpu_counter.h
index 84a1094496100906c1b89714f921451a00babb9f..ec065387f44307852a512197a0ebfaa3c45dbbf6 100644
--- a/include/linux/percpu_counter.h
+++ b/include/linux/percpu_counter.h
@@ -39,7 +39,8 @@ int __percpu_counter_init(struct percpu_counter *fbc, s64 amount, gfp_t gfp,
 
 void percpu_counter_destroy(struct percpu_counter *fbc);
 void percpu_counter_set(struct percpu_counter *fbc, s64 amount);
-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch);
+void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount,
+			      s32 batch);
 s64 __percpu_counter_sum(struct percpu_counter *fbc);
 int __percpu_counter_compare(struct percpu_counter *fbc, s64 rhs, s32 batch);
 
@@ -50,7 +51,7 @@ static inline int percpu_counter_compare(struct percpu_counter *fbc, s64 rhs)
 
 static inline void percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 {
-	__percpu_counter_add(fbc, amount, percpu_counter_batch);
+	percpu_counter_add_batch(fbc, amount, percpu_counter_batch);
 }
 
 static inline s64 percpu_counter_sum_positive(struct percpu_counter *fbc)
@@ -136,7 +137,7 @@ percpu_counter_add(struct percpu_counter *fbc, s64 amount)
 }
 
 static inline void
-__percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
 {
 	percpu_counter_add(fbc, amount);
 }
diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h
index 440c1e9d062392ba10f1ea39ec7a1b9f4e46fb0f..6fdcd242777651aa75bb55ecfcb9fce53708fd53 100644
--- a/include/net/inet_frag.h
+++ b/include/net/inet_frag.h
@@ -154,12 +154,12 @@ static inline int frag_mem_limit(struct netns_frags *nf)
 
 static inline void sub_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&nf->mem, -i, frag_percpu_counter_batch);
+	percpu_counter_add_batch(&nf->mem, -i, frag_percpu_counter_batch);
 }
 
 static inline void add_frag_mem_limit(struct netns_frags *nf, int i)
 {
-	__percpu_counter_add(&nf->mem, i, frag_percpu_counter_batch);
+	percpu_counter_add_batch(&nf->mem, i, frag_percpu_counter_batch);
 }
 
 static inline unsigned int sum_frag_mem_limit(struct netns_frags *nf)
diff --git a/include/trace/events/percpu.h b/include/trace/events/percpu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ad34b1bae04751eff139ccade9868338963e0803
--- /dev/null
+++ b/include/trace/events/percpu.h
@@ -0,0 +1,125 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM percpu
+
+#if !defined(_TRACE_PERCPU_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_PERCPU_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(percpu_alloc_percpu,
+
+	TP_PROTO(bool reserved, bool is_atomic, size_t size,
+		 size_t align, void *base_addr, int off, void __percpu *ptr),
+
+	TP_ARGS(reserved, is_atomic, size, align, base_addr, off, ptr),
+
+	TP_STRUCT__entry(
+		__field(	bool,			reserved	)
+		__field(	bool,			is_atomic	)
+		__field(	size_t,			size		)
+		__field(	size_t,			align		)
+		__field(	void *,			base_addr	)
+		__field(	int,			off		)
+		__field(	void __percpu *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->reserved	= reserved;
+		__entry->is_atomic	= is_atomic;
+		__entry->size		= size;
+		__entry->align		= align;
+		__entry->base_addr	= base_addr;
+		__entry->off		= off;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu base_addr=%p off=%d ptr=%p",
+		  __entry->reserved, __entry->is_atomic,
+		  __entry->size, __entry->align,
+		  __entry->base_addr, __entry->off, __entry->ptr)
+);
+
+TRACE_EVENT(percpu_free_percpu,
+
+	TP_PROTO(void *base_addr, int off, void __percpu *ptr),
+
+	TP_ARGS(base_addr, off, ptr),
+
+	TP_STRUCT__entry(
+		__field(	void *,			base_addr	)
+		__field(	int,			off		)
+		__field(	void __percpu *,	ptr		)
+	),
+
+	TP_fast_assign(
+		__entry->base_addr	= base_addr;
+		__entry->off		= off;
+		__entry->ptr		= ptr;
+	),
+
+	TP_printk("base_addr=%p off=%d ptr=%p",
+		__entry->base_addr, __entry->off, __entry->ptr)
+);
+
+TRACE_EVENT(percpu_alloc_percpu_fail,
+
+	TP_PROTO(bool reserved, bool is_atomic, size_t size, size_t align),
+
+	TP_ARGS(reserved, is_atomic, size, align),
+
+	TP_STRUCT__entry(
+		__field(	bool,	reserved	)
+		__field(	bool,	is_atomic	)
+		__field(	size_t,	size		)
+		__field(	size_t, align		)
+	),
+
+	TP_fast_assign(
+		__entry->reserved	= reserved;
+		__entry->is_atomic	= is_atomic;
+		__entry->size		= size;
+		__entry->align		= align;
+	),
+
+	TP_printk("reserved=%d is_atomic=%d size=%zu align=%zu",
+		  __entry->reserved, __entry->is_atomic,
+		  __entry->size, __entry->align)
+);
+
+TRACE_EVENT(percpu_create_chunk,
+
+	TP_PROTO(void *base_addr),
+
+	TP_ARGS(base_addr),
+
+	TP_STRUCT__entry(
+		__field(	void *, base_addr	)
+	),
+
+	TP_fast_assign(
+		__entry->base_addr	= base_addr;
+	),
+
+	TP_printk("base_addr=%p", __entry->base_addr)
+);
+
+TRACE_EVENT(percpu_destroy_chunk,
+
+	TP_PROTO(void *base_addr),
+
+	TP_ARGS(base_addr),
+
+	TP_STRUCT__entry(
+		__field(	void *,	base_addr	)
+	),
+
+	TP_fast_assign(
+		__entry->base_addr	= base_addr;
+	),
+
+	TP_printk("base_addr=%p", __entry->base_addr)
+);
+
+#endif /* _TRACE_PERCPU_H */
+
+#include <trace/define_trace.h>
diff --git a/lib/flex_proportions.c b/lib/flex_proportions.c
index a71cf1bdd4c94a92c4c888be21cad8eebf813c99..2cc1f94e03a1dbb789608e2ccad119a927d414fc 100644
--- a/lib/flex_proportions.c
+++ b/lib/flex_proportions.c
@@ -207,7 +207,7 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 		if (val < (nr_cpu_ids * PROP_BATCH))
 			val = percpu_counter_sum(&pl->events);
 
-		__percpu_counter_add(&pl->events,
+		percpu_counter_add_batch(&pl->events,
 			-val + (val >> (period-pl->period)), PROP_BATCH);
 	} else
 		percpu_counter_set(&pl->events, 0);
@@ -219,7 +219,7 @@ static void fprop_reflect_period_percpu(struct fprop_global *p,
 void __fprop_inc_percpu(struct fprop_global *p, struct fprop_local_percpu *pl)
 {
 	fprop_reflect_period_percpu(p, pl);
-	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&p->events, 1);
 }
 
@@ -267,6 +267,6 @@ void __fprop_inc_percpu_max(struct fprop_global *p,
 			return;
 	} else
 		fprop_reflect_period_percpu(p, pl);
-	__percpu_counter_add(&pl->events, 1, PROP_BATCH);
+	percpu_counter_add_batch(&pl->events, 1, PROP_BATCH);
 	percpu_counter_add(&p->events, 1);
 }
diff --git a/lib/percpu_counter.c b/lib/percpu_counter.c
index 9c21000df0b5ea1b99a83fd73a338073cb7fd016..8ee7e5ec21be23f658323a9cea840e3d7fb41ce6 100644
--- a/lib/percpu_counter.c
+++ b/lib/percpu_counter.c
@@ -72,7 +72,7 @@ void percpu_counter_set(struct percpu_counter *fbc, s64 amount)
 }
 EXPORT_SYMBOL(percpu_counter_set);
 
-void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
+void percpu_counter_add_batch(struct percpu_counter *fbc, s64 amount, s32 batch)
 {
 	s64 count;
 
@@ -89,7 +89,7 @@ void __percpu_counter_add(struct percpu_counter *fbc, s64 amount, s32 batch)
 	}
 	preempt_enable();
 }
-EXPORT_SYMBOL(__percpu_counter_add);
+EXPORT_SYMBOL(percpu_counter_add_batch);
 
 /*
  * Add up all the per-cpu counts, return the result.  This is a more accurate
diff --git a/mm/Kconfig b/mm/Kconfig
index 398b460645447f266d7764bb6d0532609159e630..665cb370ad3839d5d6d68e91790fc76c734e9cd7 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -706,3 +706,11 @@ config ARCH_USES_HIGH_VMA_FLAGS
 	bool
 config ARCH_HAS_PKEYS
 	bool
+
+config PERCPU_STATS
+	bool "Collect percpu memory statistics"
+	default n
+	help
+	  This feature collects and exposes statistics via debugfs. The
+	  information includes global and per-chunk statistics, which can
+	  be used to help understand percpu memory usage.
diff --git a/mm/Makefile b/mm/Makefile
index 026f6a828a5023cdfa1b3239075a77c904409463..411bd24d4a7c78dabc1f508201feb5795a8c5ad8 100644
--- a/mm/Makefile
+++ b/mm/Makefile
@@ -103,3 +103,4 @@ obj-$(CONFIG_IDLE_PAGE_TRACKING) += page_idle.o
 obj-$(CONFIG_FRAME_VECTOR) += frame_vector.o
 obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o
 obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o
+obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o
diff --git a/mm/percpu-internal.h b/mm/percpu-internal.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd2442e13d8f5add724bb6be9de1c3dc2ae8f715
--- /dev/null
+++ b/mm/percpu-internal.h
@@ -0,0 +1,167 @@
+#ifndef _MM_PERCPU_INTERNAL_H
+#define _MM_PERCPU_INTERNAL_H
+
+#include <linux/types.h>
+#include <linux/percpu.h>
+
+struct pcpu_chunk {
+#ifdef CONFIG_PERCPU_STATS
+	int			nr_alloc;	/* # of allocations */
+	size_t			max_alloc_size; /* largest allocation size */
+#endif
+
+	struct list_head	list;		/* linked to pcpu_slot lists */
+	int			free_size;	/* free bytes in the chunk */
+	int			contig_hint;	/* max contiguous size hint */
+	void			*base_addr;	/* base address of this chunk */
+
+	int			map_used;	/* # of map entries used before the sentry */
+	int			map_alloc;	/* # of map entries allocated */
+	int			*map;		/* allocation map */
+	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */
+
+	void			*data;		/* chunk data */
+	int			first_free;	/* no free below this */
+	bool			immutable;	/* no [de]population allowed */
+	bool			has_reserved;	/* Indicates whether the chunk has
+						   reserved space at the beginning. The
+						   reserved chunk holds the reservation for
+						   the static chunk; the dynamic chunk, for
+						   both the static and reserved chunks. */
+	int			nr_populated;	/* # of populated pages */
+	unsigned long		populated[];	/* populated bitmap */
+};
+
+extern spinlock_t pcpu_lock;
+
+extern struct list_head *pcpu_slot;
+extern int pcpu_nr_slots;
+
+extern struct pcpu_chunk *pcpu_first_chunk;
+extern struct pcpu_chunk *pcpu_reserved_chunk;
+
+#ifdef CONFIG_PERCPU_STATS
+
+#include <linux/spinlock.h>
+
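+/* global percpu memory statistics, updated under pcpu_lock */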
+struct percpu_stats {
+	u64 nr_alloc;		/* lifetime # of allocations */
+	u64 nr_dealloc;		/* lifetime # of deallocations */
+	u64 nr_cur_alloc;	/* current # of allocations */
+	u64 nr_max_alloc;	/* max # of live allocations */
+	u32 nr_chunks;		/* current # of live chunks */
+	u32 nr_max_chunks;	/* max # of live chunks */
+	size_t min_alloc_size;	/* min allocation size */
+	size_t max_alloc_size;	/* max allocation size */
+};
+
+extern struct percpu_stats pcpu_stats;
+extern struct pcpu_alloc_info pcpu_stats_ai;
+
+/*
+ * For debug purposes. We don't care about the flexible array.
+ */
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+	memcpy(&pcpu_stats_ai, ai, sizeof(struct pcpu_alloc_info));
+
+	/* initialize min_alloc_size to unit_size */
+	pcpu_stats.min_alloc_size = pcpu_stats_ai.unit_size;
+}
+
+/*
+ * pcpu_stats_area_alloc - increment area allocation stats
+ * @chunk: the location of the area being allocated
+ * @size: size of area to allocate in bytes
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	pcpu_stats.nr_alloc++;
+	pcpu_stats.nr_cur_alloc++;
+	pcpu_stats.nr_max_alloc =
+		max(pcpu_stats.nr_max_alloc, pcpu_stats.nr_cur_alloc);
+	pcpu_stats.min_alloc_size =
+		min(pcpu_stats.min_alloc_size, size);
+	pcpu_stats.max_alloc_size =
+		max(pcpu_stats.max_alloc_size, size);
+
+	chunk->nr_alloc++;
+	chunk->max_alloc_size = max(chunk->max_alloc_size, size);
+}
+
+/*
+ * pcpu_stats_area_dealloc - decrement allocation stats
+ * @chunk: the location of the area being deallocated
+ *
+ * CONTEXT:
+ * pcpu_lock.
+ */
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+	lockdep_assert_held(&pcpu_lock);
+
+	pcpu_stats.nr_dealloc++;
+	pcpu_stats.nr_cur_alloc--;
+
+	chunk->nr_alloc--;
+}
+
+/*
+ * pcpu_stats_chunk_alloc - increment chunk stats
+ */
+static inline void pcpu_stats_chunk_alloc(void)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	pcpu_stats.nr_chunks++;
+	pcpu_stats.nr_max_chunks =
+		max(pcpu_stats.nr_max_chunks, pcpu_stats.nr_chunks);
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+/*
+ * pcpu_stats_chunk_dealloc - decrement chunk stats
+ */
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+	unsigned long flags;
+	spin_lock_irqsave(&pcpu_lock, flags);
+
+	pcpu_stats.nr_chunks--;
+
+	spin_unlock_irqrestore(&pcpu_lock, flags);
+}
+
+#else
+
+static inline void pcpu_stats_save_ai(const struct pcpu_alloc_info *ai)
+{
+}
+
+static inline void pcpu_stats_area_alloc(struct pcpu_chunk *chunk, size_t size)
+{
+}
+
+static inline void pcpu_stats_area_dealloc(struct pcpu_chunk *chunk)
+{
+}
+
+static inline void pcpu_stats_chunk_alloc(void)
+{
+}
+
+static inline void pcpu_stats_chunk_dealloc(void)
+{
+}
+
+#endif /* !CONFIG_PERCPU_STATS */
+
+#endif
diff --git a/mm/percpu-km.c b/mm/percpu-km.c
index d66911ff42d9618dd73107548e4f99b06d2f98ef..eb58aa4c0997540fb3eb912ee6449d2daf7d3ab2 100644
--- a/mm/percpu-km.c
+++ b/mm/percpu-km.c
@@ -72,6 +72,9 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
 	pcpu_chunk_populated(chunk, 0, nr_pages);
 	spin_unlock_irq(&pcpu_lock);
 
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(chunk->base_addr);
+
 	return chunk;
 }
 
@@ -79,7 +82,13 @@ static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
 {
 	const int nr_pages = pcpu_group_sizes[0] >> PAGE_SHIFT;
 
-	if (chunk && chunk->data)
+	if (!chunk)
+		return;
+
+	pcpu_stats_chunk_dealloc();
+	trace_percpu_destroy_chunk(chunk->base_addr);
+
+	if (chunk->data)
 		__free_pages(chunk->data, order_base_2(nr_pages));
 	pcpu_free_chunk(chunk);
 }
diff --git a/mm/percpu-stats.c b/mm/percpu-stats.c
new file mode 100644
index 0000000000000000000000000000000000000000..03524a56eefff19c9b093da5343e9196023068bb
--- /dev/null
+++ b/mm/percpu-stats.c
@@ -0,0 +1,225 @@
+/*
+ * mm/percpu-stats.c
+ *
+ * Copyright (C) 2017		Facebook Inc.
+ * Copyright (C) 2017		Dennis Zhou <dennisz@fb.com>
+ *
+ * This file is released under the GPLv2.
+ *
+ * Prints statistics about the percpu allocator and backing chunks.
+ */
+#include <linux/debugfs.h>
+#include <linux/list.h>
+#include <linux/percpu.h>
+#include <linux/seq_file.h>
+#include <linux/sort.h>
+#include <linux/vmalloc.h>
+
+#include "percpu-internal.h"
+
+#define P(X, Y) \
+	seq_printf(m, "  %-24s: %8lld\n", X, (long long int)Y)
+
+struct percpu_stats pcpu_stats;
+struct pcpu_alloc_info pcpu_stats_ai;
+
+static int cmpint(const void *a, const void *b)
+{
+	return *(int *)a - *(int *)b;
+}
+
+/*
+ * Iterates over all chunks to find the max # of map entries used.
+ */
+static int find_max_map_used(void)
+{
+	struct pcpu_chunk *chunk;
+	int slot, max_map_used;
+
+	max_map_used = 0;
+	for (slot = 0; slot < pcpu_nr_slots; slot++)
+		list_for_each_entry(chunk, &pcpu_slot[slot], list)
+			max_map_used = max(max_map_used, chunk->map_used);
+
+	return max_map_used;
+}
+
+/*
+ * Prints out chunk state. Fragmentation is considered from the
+ * beginning of the chunk to the last allocation.
+ */
+static void chunk_map_stats(struct seq_file *m, struct pcpu_chunk *chunk,
+			    void *buffer)
+{
+	int i, s_index, last_alloc, alloc_sign, as_len;
+	int *alloc_sizes, *p;
+	/* statistics */
+	int sum_frag = 0, max_frag = 0;
+	int cur_min_alloc = 0, cur_med_alloc = 0, cur_max_alloc = 0;
+
+	alloc_sizes = buffer;
+	s_index = chunk->has_reserved ? 1 : 0;
+
+	/* find last allocation */
+	last_alloc = -1;
+	for (i = chunk->map_used - 1; i >= s_index; i--) {
+		if (chunk->map[i] & 1) {
+			last_alloc = i;
+			break;
+		}
+	}
+
+	/* if the chunk is not empty - ignoring reserve */
+	if (last_alloc >= s_index) {
+		as_len = last_alloc + 1 - s_index;
+
+		/*
+		 * Iterate through chunk map computing size info.
+		 * The first bit is overloaded to be a used flag.
+		 * negative = free space, positive = allocated
+		 */
+		for (i = 0, p = chunk->map + s_index; i < as_len; i++, p++) {
+			alloc_sign = (*p & 1) ? 1 : -1;
+			alloc_sizes[i] = alloc_sign *
+				((p[1] & ~1) - (p[0] & ~1));
+		}
+
+		sort(alloc_sizes, as_len, sizeof(chunk->map[0]), cmpint, NULL);
+
+		/* Iterate through the unallocated fragments. */
+		for (i = 0, p = alloc_sizes; *p < 0 && i < as_len; i++, p++) {
+			sum_frag -= *p;
+			max_frag = max(max_frag, -1 * (*p));
+		}
+
+		cur_min_alloc = alloc_sizes[i];
+		cur_med_alloc = alloc_sizes[(i + as_len - 1) / 2];
+		cur_max_alloc = alloc_sizes[as_len - 1];
+	}
+
+	P("nr_alloc", chunk->nr_alloc);
+	P("max_alloc_size", chunk->max_alloc_size);
+	P("free_size", chunk->free_size);
+	P("contig_hint", chunk->contig_hint);
+	P("sum_frag", sum_frag);
+	P("max_frag", max_frag);
+	P("cur_min_alloc", cur_min_alloc);
+	P("cur_med_alloc", cur_med_alloc);
+	P("cur_max_alloc", cur_max_alloc);
+	seq_putc(m, '\n');
+}
+
+static int percpu_stats_show(struct seq_file *m, void *v)
+{
+	struct pcpu_chunk *chunk;
+	int slot, max_map_used;
+	void *buffer;
+
+alloc_buffer:
+	spin_lock_irq(&pcpu_lock);
+	max_map_used = find_max_map_used();
+	spin_unlock_irq(&pcpu_lock);
+
+	buffer = vmalloc(max_map_used * sizeof(pcpu_first_chunk->map[0]));
+	if (!buffer)
+		return -ENOMEM;
+
+	spin_lock_irq(&pcpu_lock);
+
+	/* if the buffer allocated earlier is too small */
+	if (max_map_used < find_max_map_used()) {
+		spin_unlock_irq(&pcpu_lock);
+		vfree(buffer);
+		goto alloc_buffer;
+	}
+
+#define PL(X) \
+	seq_printf(m, "  %-24s: %8lld\n", #X, (long long int)pcpu_stats_ai.X)
+
+	seq_printf(m,
+			"Percpu Memory Statistics\n"
+			"Allocation Info:\n"
+			"----------------------------------------\n");
+	PL(unit_size);
+	PL(static_size);
+	PL(reserved_size);
+	PL(dyn_size);
+	PL(atom_size);
+	PL(alloc_size);
+	seq_putc(m, '\n');
+
+#undef PL
+
+#define PU(X) \
+	seq_printf(m, "  %-18s: %14llu\n", #X, (unsigned long long)pcpu_stats.X)
+
+	seq_printf(m,
+			"Global Stats:\n"
+			"----------------------------------------\n");
+	PU(nr_alloc);
+	PU(nr_dealloc);
+	PU(nr_cur_alloc);
+	PU(nr_max_alloc);
+	PU(nr_chunks);
+	PU(nr_max_chunks);
+	PU(min_alloc_size);
+	PU(max_alloc_size);
+	seq_putc(m, '\n');
+
+#undef PU
+
+	seq_printf(m,
+			"Per Chunk Stats:\n"
+			"----------------------------------------\n");
+
+	if (pcpu_reserved_chunk) {
+		seq_puts(m, "Chunk: <- Reserved Chunk\n");
+		chunk_map_stats(m, pcpu_reserved_chunk, buffer);
+	}
+
+	for (slot = 0; slot < pcpu_nr_slots; slot++) {
+		list_for_each_entry(chunk, &pcpu_slot[slot], list) {
+			if (chunk == pcpu_first_chunk) {
+				seq_puts(m, "Chunk: <- First Chunk\n");
+				chunk_map_stats(m, chunk, buffer);
+			} else {
+				seq_puts(m, "Chunk:\n");
+				chunk_map_stats(m, chunk, buffer);
+			}
+		}
+	}
+
+	spin_unlock_irq(&pcpu_lock);
+
+	vfree(buffer);
+
+	return 0;
+}
+
+static int percpu_stats_open(struct inode *inode, struct file *filp)
+{
+	return single_open(filp, percpu_stats_show, NULL);
+}
+
+static const struct file_operations percpu_stats_fops = {
+	.open		= percpu_stats_open,
+	.read		= seq_read,
+	.llseek		= seq_lseek,
+	.release	= single_release,
+};
+
+static int __init init_percpu_stats_debugfs(void)
+{
+	debugfs_create_file("percpu_stats", 0444, NULL, NULL,
+			&percpu_stats_fops);
+
+	return 0;
+}
+
+late_initcall(init_percpu_stats_debugfs);
diff --git a/mm/percpu-vm.c b/mm/percpu-vm.c
index 9ac639499bd1146347557141b10f1135ee2c0048..15dab691ea7076c99e198506b69128049f1555ad 100644
--- a/mm/percpu-vm.c
+++ b/mm/percpu-vm.c
@@ -343,12 +343,22 @@ static struct pcpu_chunk *pcpu_create_chunk(void)
 
 	chunk->data = vms;
 	chunk->base_addr = vms[0]->addr - pcpu_group_offsets[0];
+
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(chunk->base_addr);
+
 	return chunk;
 }
 
 static void pcpu_destroy_chunk(struct pcpu_chunk *chunk)
 {
-	if (chunk && chunk->data)
+	if (!chunk)
+		return;
+
+	pcpu_stats_chunk_dealloc();
+	trace_percpu_destroy_chunk(chunk->base_addr);
+
+	if (chunk->data)
 		pcpu_free_vm_areas(chunk->data, pcpu_nr_groups);
 	pcpu_free_chunk(chunk);
 }
diff --git a/mm/percpu.c b/mm/percpu.c
index e0aa8ae7bde708188e2d6ba84dbdc12dcf11f52e..bd4130a69bbc9b6b631baf911e66aab31f08b6e0 100644
--- a/mm/percpu.c
+++ b/mm/percpu.c
@@ -76,6 +76,11 @@
 #include <asm/tlbflush.h>
 #include <asm/io.h>
 
+#define CREATE_TRACE_POINTS
+#include <trace/events/percpu.h>
+
+#include "percpu-internal.h"
+
 #define PCPU_SLOT_BASE_SHIFT		5	/* 1-31 shares the same slot */
 #define PCPU_DFL_MAP_ALLOC		16	/* start a map with 16 ents */
 #define PCPU_ATOMIC_MAP_MARGIN_LOW	32
@@ -103,53 +108,35 @@
 #define __pcpu_ptr_to_addr(ptr)		(void __force *)(ptr)
 #endif	/* CONFIG_SMP */
 
-struct pcpu_chunk {
-	struct list_head	list;		/* linked to pcpu_slot lists */
-	int			free_size;	/* free bytes in the chunk */
-	int			contig_hint;	/* max contiguous size hint */
-	void			*base_addr;	/* base address of this chunk */
-
-	int			map_used;	/* # of map entries used before the sentry */
-	int			map_alloc;	/* # of map entries allocated */
-	int			*map;		/* allocation map */
-	struct list_head	map_extend_list;/* on pcpu_map_extend_chunks */
-
-	void			*data;		/* chunk data */
-	int			first_free;	/* no free below this */
-	bool			immutable;	/* no [de]population allowed */
-	int			nr_populated;	/* # of populated pages */
-	unsigned long		populated[];	/* populated bitmap */
-};
-
-static int pcpu_unit_pages __read_mostly;
-static int pcpu_unit_size __read_mostly;
-static int pcpu_nr_units __read_mostly;
-static int pcpu_atom_size __read_mostly;
-static int pcpu_nr_slots __read_mostly;
-static size_t pcpu_chunk_struct_size __read_mostly;
+static int pcpu_unit_pages __ro_after_init;
+static int pcpu_unit_size __ro_after_init;
+static int pcpu_nr_units __ro_after_init;
+static int pcpu_atom_size __ro_after_init;
+int pcpu_nr_slots __ro_after_init;
+static size_t pcpu_chunk_struct_size __ro_after_init;
 
 /* cpus with the lowest and highest unit addresses */
-static unsigned int pcpu_low_unit_cpu __read_mostly;
-static unsigned int pcpu_high_unit_cpu __read_mostly;
+static unsigned int pcpu_low_unit_cpu __ro_after_init;
+static unsigned int pcpu_high_unit_cpu __ro_after_init;
 
 /* the address of the first chunk which starts with the kernel static area */
-void *pcpu_base_addr __read_mostly;
+void *pcpu_base_addr __ro_after_init;
 EXPORT_SYMBOL_GPL(pcpu_base_addr);
 
-static const int *pcpu_unit_map __read_mostly;		/* cpu -> unit */
-const unsigned long *pcpu_unit_offsets __read_mostly;	/* cpu -> unit offset */
+static const int *pcpu_unit_map __ro_after_init;		/* cpu -> unit */
+const unsigned long *pcpu_unit_offsets __ro_after_init;	/* cpu -> unit offset */
 
 /* group information, used for vm allocation */
-static int pcpu_nr_groups __read_mostly;
-static const unsigned long *pcpu_group_offsets __read_mostly;
-static const size_t *pcpu_group_sizes __read_mostly;
+static int pcpu_nr_groups __ro_after_init;
+static const unsigned long *pcpu_group_offsets __ro_after_init;
+static const size_t *pcpu_group_sizes __ro_after_init;
 
 /*
  * The first chunk which always exists.  Note that unlike other
  * chunks, this one can be allocated and mapped in several different
  * ways and thus often doesn't live in the vmalloc area.
  */
-static struct pcpu_chunk *pcpu_first_chunk;
+struct pcpu_chunk *pcpu_first_chunk __ro_after_init;
 
 /*
  * Optional reserved chunk.  This chunk reserves part of the first
@@ -158,13 +145,13 @@ static struct pcpu_chunk *pcpu_first_chunk;
  * area doesn't exist, the following variables contain NULL and 0
  * respectively.
  */
-static struct pcpu_chunk *pcpu_reserved_chunk;
-static int pcpu_reserved_chunk_limit;
+struct pcpu_chunk *pcpu_reserved_chunk __ro_after_init;
+static int pcpu_reserved_chunk_limit __ro_after_init;
 
-static DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
+DEFINE_SPINLOCK(pcpu_lock);	/* all internal data structures */
 static DEFINE_MUTEX(pcpu_alloc_mutex);	/* chunk create/destroy, [de]pop, map ext */
 
-static struct list_head *pcpu_slot __read_mostly; /* chunk list slots */
+struct list_head *pcpu_slot __ro_after_init; /* chunk list slots */
 
 /* chunks which need their map areas extended, protected by pcpu_lock */
 static LIST_HEAD(pcpu_map_extend_chunks);
@@ -672,6 +659,9 @@ static void pcpu_free_area(struct pcpu_chunk *chunk, int freeme,
 	int to_free = 0;
 	int *p;
 
+	lockdep_assert_held(&pcpu_lock);
+	pcpu_stats_area_dealloc(chunk);
+
 	freeme |= 1;	/* we are searching for <given offset, in use> pair */
 
 	i = 0;
@@ -735,6 +725,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void)
 	chunk->map[0] = 0;
 	chunk->map[1] = pcpu_unit_size | 1;
 	chunk->map_used = 1;
+	chunk->has_reserved = false;
 
 	INIT_LIST_HEAD(&chunk->list);
 	INIT_LIST_HEAD(&chunk->map_extend_list);
@@ -965,8 +956,10 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	 * tasks to create chunks simultaneously.  Serialize and create iff
 	 * there's still no empty chunk after grabbing the mutex.
 	 */
-	if (is_atomic)
+	if (is_atomic) {
+		err = "atomic alloc failed, no space left";
 		goto fail;
+	}
 
 	if (list_empty(&pcpu_slot[pcpu_nr_slots - 1])) {
 		chunk = pcpu_create_chunk();
@@ -984,6 +977,7 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 	goto restart;
 
 area_found:
+	pcpu_stats_area_alloc(chunk, size);
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 
 	/* populate if not all pages are already there */
@@ -1026,11 +1020,17 @@ static void __percpu *pcpu_alloc(size_t size, size_t align, bool reserved,
 
 	ptr = __addr_to_pcpu_ptr(chunk->base_addr + off);
 	kmemleak_alloc_percpu(ptr, size, gfp);
+
+	trace_percpu_alloc_percpu(reserved, is_atomic, size, align,
+			chunk->base_addr, off, ptr);
+
 	return ptr;
 
 fail_unlock:
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 fail:
+	trace_percpu_alloc_percpu_fail(reserved, is_atomic, size, align);
+
 	if (!is_atomic && warn_limit) {
 		pr_warn("allocation failed, size=%zu align=%zu atomic=%d, %s\n",
 			size, align, is_atomic, err);
@@ -1280,6 +1280,8 @@ void free_percpu(void __percpu *ptr)
 			}
 	}
 
+	trace_percpu_free_percpu(chunk->base_addr, off, ptr);
+
 	spin_unlock_irqrestore(&pcpu_lock, flags);
 }
 EXPORT_SYMBOL_GPL(free_percpu);
@@ -1656,6 +1658,8 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	pcpu_chunk_struct_size = sizeof(struct pcpu_chunk) +
 		BITS_TO_LONGS(pcpu_unit_pages) * sizeof(unsigned long);
 
+	pcpu_stats_save_ai(ai);
+
 	/*
 	 * Allocate chunk slots.  The additional last slot is for
 	 * empty chunks.
@@ -1699,6 +1703,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 	if (schunk->free_size)
 		schunk->map[++schunk->map_used] = ai->static_size + schunk->free_size;
 	schunk->map[schunk->map_used] |= 1;
+	schunk->has_reserved = true;
 
 	/* init dynamic chunk if necessary */
 	if (dyn_size) {
@@ -1717,6 +1722,7 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		dchunk->map[1] = pcpu_reserved_chunk_limit;
 		dchunk->map[2] = (pcpu_reserved_chunk_limit + dchunk->free_size) | 1;
 		dchunk->map_used = 2;
+		dchunk->has_reserved = true;
 	}
 
 	/* link the first chunk in */
@@ -1725,6 +1731,9 @@ int __init pcpu_setup_first_chunk(const struct pcpu_alloc_info *ai,
 		pcpu_count_occupied_pages(pcpu_first_chunk, 1);
 	pcpu_chunk_relocate(pcpu_first_chunk, -1);
 
+	pcpu_stats_chunk_alloc();
+	trace_percpu_create_chunk(base_addr);
+
 	/* we're done */
 	pcpu_base_addr = base_addr;
 	return 0;