Newer
Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
From 0e955dc47a08815500de2f559d0e6781622cbbf2 Mon Sep 17 00:00:00 2001
From: Daniel Jordan <daniel.m.jordan@oracle.com>
Date: Tue, 26 Nov 2019 02:38:09 -0500
Subject: [PATCH] padata: add basic support for multithreaded jobs

Sometimes the kernel doesn't take full advantage of system memory
bandwidth, leading to a single CPU spending excessive time in
initialization paths where the data scales with memory size.
Multithreading naturally addresses this problem.

Extend padata, a framework that handles many parallel yet singlethreaded
jobs, to also handle multithreaded jobs by adding support for splitting
up the work evenly, specifying a minimum amount of work that's
appropriate for one helper thread to do, load balancing between helpers,
and coordinating them.

This is inspired by work from Pavel Tatashin and Steve Sistare.

Signed-off-by: Daniel Jordan <daniel.m.jordan@oracle.com>
---
include/linux/padata.h | 29 +++++++++
kernel/padata.c | 152 +++++++++++++++++++++++++++++++++++++++++++++++-
2 files changed, 178 insertions(+), 3 deletions(-)
diff --git a/include/linux/padata.h b/include/linux/padata.h
index 3bfa503..b0affa4 100644
--- a/include/linux/padata.h
+++ b/include/linux/padata.h
@@ -4,6 +4,9 @@
*
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
+ *
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
*/
#ifndef PADATA_H
@@ -131,6 +134,31 @@ struct padata_shell {
};
/**
+ * struct padata_mt_job - represents one multithreaded job
+ *
+ * @thread_fn: Called on each chunk [start, end) that a padata thread works on.
+ * @fn_arg: The thread function argument.
+ * @start: The start of the job (units are job-specific).
+ * @size: The size of this node's work (units are job-specific).
+ * @align: Ranges passed to the thread function fall on this boundary, with the
+ * possible exceptions of the beginning and end of the job.
+ * @min_chunk: The minimum chunk size in job-specific units. This allows
+ * the client to communicate the minimum amount of work that's
+ * appropriate for one worker thread to do at once.
+ * @max_threads: Max threads to use for the job; the actual number may be less
+ * depending on task size and minimum chunk size.
+ */
+struct padata_mt_job {
+ void (*thread_fn)(unsigned long start, unsigned long end, void *arg);
+ void *fn_arg;
+ unsigned long start;
+ unsigned long size;
+ unsigned long align;
+ unsigned long min_chunk;
+ int max_threads;
+};
+
+/**
* struct padata_instance - The overall control structure.
*
* @node: Used by CPU hotplug.
@@ -171,6 +199,7 @@ extern void padata_free_shell(struct padata_shell *ps);
extern int padata_do_parallel(struct padata_shell *ps,
struct padata_priv *padata, int *cb_cpu);
extern void padata_do_serial(struct padata_priv *padata);
+extern void __init padata_do_multithreaded(struct padata_mt_job *job);
extern int padata_set_cpumask(struct padata_instance *pinst, int cpumask_type,
cpumask_var_t cpumask);
extern int padata_start(struct padata_instance *pinst);
diff --git a/kernel/padata.c b/kernel/padata.c
index edd3ff5..ccb617d 100644
--- a/kernel/padata.c
+++ b/kernel/padata.c
@@ -7,6 +7,9 @@
* Copyright (C) 2008, 2009 secunet Security Networks AG
* Copyright (C) 2008, 2009 Steffen Klassert <steffen.klassert@secunet.com>
*
+ * Copyright (c) 2020 Oracle and/or its affiliates.
+ * Author: Daniel Jordan <daniel.m.jordan@oracle.com>
+ *
* This program is free software; you can redistribute it and/or modify it
* under the terms and conditions of the GNU General Public License,
* version 2, as published by the Free Software Foundation.
@@ -21,6 +24,7 @@
* 51 Franklin St - Fifth Floor, Boston, MA 02110-1301 USA.
*/
+#include <linux/completion.h>
#include <linux/export.h>
#include <linux/cpumask.h>
#include <linux/err.h>
@@ -32,6 +36,8 @@
#include <linux/sysfs.h>
#include <linux/rcupdate.h>
+#define PADATA_WORK_ONSTACK 1 /* Work's memory is on stack */
+
struct padata_work {
struct work_struct pw_work;
struct list_head pw_list; /* padata_free_works linkage */
@@ -42,7 +48,17 @@ static DEFINE_SPINLOCK(padata_works_lock);
static struct padata_work *padata_works;
static LIST_HEAD(padata_free_works);
+struct padata_mt_job_state { /* bookkeeping shared by one job's helpers */
+ spinlock_t lock; /* held while updating job->start/size, nworks_fini */
+ struct completion completion; /* completed by the last helper to finish */
+ struct padata_mt_job *job; /* the caller's job description */
+ int nworks; /* helpers taking part, including the calling task */
+ int nworks_fini; /* helpers that have run out of work */
+ unsigned long chunk_size; /* work handed out per thread function call */
+};
+
static void padata_free_pd(struct parallel_data *pd);
+static void __init padata_mt_helper(struct work_struct *work);
static int padata_index_to_cpu(struct parallel_data *pd, int cpu_index)
{
@@ -81,18 +97,56 @@ static struct padata_work *padata_work_alloc(void)
}
static void padata_work_init(struct padata_work *pw, work_func_t work_fn,
- void *data)
+ void *data, int flags)
{
- INIT_WORK(&pw->pw_work, work_fn);
+ if (flags & PADATA_WORK_ONSTACK)
+ INIT_WORK_ONSTACK(&pw->pw_work, work_fn);
+ else
+ INIT_WORK(&pw->pw_work, work_fn);
pw->pw_data = data;
}
+static int __init padata_work_alloc_mt(int nworks, void *data,
+ struct list_head *head)
+{
+ int i;
+
+ spin_lock(&padata_works_lock);
+ /* Start at 1 because the current task participates in the job. */
+ for (i = 1; i < nworks; ++i) {
+ struct padata_work *pw = padata_work_alloc();
+
+ if (!pw)
+ break; /* no work items left; run with fewer helpers */
+ padata_work_init(pw, padata_mt_helper, data, 0);
+ list_add(&pw->pw_list, head); /* collected on the caller's list */
+ }
+ spin_unlock(&padata_works_lock);
+
+ return i; /* works allocated plus one for the current task */
+}
+
static void padata_work_free(struct padata_work *pw)
{
lockdep_assert_held(&padata_works_lock);
list_add(&pw->pw_list, &padata_free_works);
}
+static void __init padata_works_free(struct list_head *works)
+{
+ struct padata_work *cur, *next;
+
+ if (list_empty(works))
+ return; /* nothing to give back, so skip the lock */
+
+ spin_lock(&padata_works_lock); /* padata_work_free() asserts it is held */
+ list_for_each_entry_safe(cur, next, works, pw_list) { /* unlinks as it walks */
+ list_del(&cur->pw_list);
+ padata_work_free(cur); /* puts it on padata_free_works */
+ }
+ spin_unlock(&padata_works_lock);
+}
+
static void padata_parallel_worker(struct work_struct *parallel_work)
{
struct padata_work *pw = container_of(parallel_work, struct padata_work,
@@ -168,7 +222,7 @@ int padata_do_parallel(struct padata_shell *ps,
pw = padata_work_alloc();
spin_unlock(&padata_works_lock);
if (pw) {
- padata_work_init(pw, padata_parallel_worker, padata);
+ padata_work_init(pw, padata_parallel_worker, padata, 0);
queue_work(pinst->parallel_wq, &pw->pw_work);
} else {
/* Maximum works limit exceeded, run in the current task. */
@@ -409,6 +463,98 @@ out:
return err;
}
+static void __init padata_mt_helper(struct work_struct *w)
+{
+ struct padata_work *pw = container_of(w, struct padata_work, pw_work);
+ struct padata_mt_job_state *ps = pw->pw_data;
+ struct padata_mt_job *job = ps->job;
+ bool done;
+
+ spin_lock(&ps->lock); /* protects job->start, job->size, nworks_fini */
+
+ while (job->size > 0) { /* claim chunks until none are left */
+ unsigned long start, size, end;
+
+ start = job->start;
+ /* So end is chunk size aligned if enough work remains. */
+ size = roundup(start + 1, ps->chunk_size) - start;
+ size = min(size, job->size); /* the last chunk may be short */
+ end = start + size;
+
+ job->start = end; /* the next claimer starts here */
+ job->size -= size;
+
+ spin_unlock(&ps->lock); /* do the work without holding the lock */
+ job->thread_fn(start, end, job->fn_arg);
+ spin_lock(&ps->lock);
+ }
+
+ ++ps->nworks_fini; /* this helper has run out of work */
+ done = (ps->nworks_fini == ps->nworks);
+ spin_unlock(&ps->lock);
+
+ if (done)
+ complete(&ps->completion); /* the last helper to finish signals it */
+}
+
+/**
+ * padata_do_multithreaded - run a multithreaded job
+ * @job: Description of the job; its ->start and ->size may be modified.
+ *
+ * Returns once all of the work is done. See struct padata_mt_job for details.
+ */
+void __init padata_do_multithreaded(struct padata_mt_job *job)
+{
+	/* In case threads finish at different times. */
+	static const unsigned long load_balance_factor = 4;
+	struct padata_work my_work, *pw;
+	struct padata_mt_job_state ps;
+	LIST_HEAD(works);
+	int nworks;
+
+	if (job->size == 0)
+		return;
+
+	/* Ensure at least one thread; treat a zero min_chunk as 1. */
+	nworks = max(job->size / max(job->min_chunk, 1ul), 1ul);
+	nworks = min(nworks, job->max_threads);
+
+	if (nworks == 1) {
+		/* Single thread, no coordination needed, cut to the chase. */
+		job->thread_fn(job->start, job->start + job->size, job->fn_arg);
+		return;
+	}
+
+	spin_lock_init(&ps.lock);
+	init_completion(&ps.completion);
+	ps.job = job;
+	ps.nworks = padata_work_alloc_mt(nworks, &ps, &works);
+	ps.nworks_fini = 0;
+
+	/*
+	 * Chunk size is the work a helper does per thread function call. Load
+	 * balance large jobs by increasing the number of chunks, guarantee the
+	 * caller's minimum chunk size, and honor the caller's alignment. Zero
+	 * min_chunk or align counts as 1 so nothing here divides by zero.
+	 */
+	ps.chunk_size = job->size / (ps.nworks * load_balance_factor);
+	ps.chunk_size = max(ps.chunk_size, max(job->min_chunk, 1ul));
+	ps.chunk_size = roundup(ps.chunk_size, max(job->align, 1ul));
+
+	list_for_each_entry(pw, &works, pw_list)
+		queue_work(system_unbound_wq, &pw->pw_work);
+
+	/* Use the current thread, which saves starting a workqueue worker. */
+	padata_work_init(&my_work, padata_mt_helper, &ps, PADATA_WORK_ONSTACK);
+	padata_mt_helper(&my_work.pw_work);
+
+	/* Wait for all the helpers to finish. */
+	wait_for_completion(&ps.completion);
+
+	destroy_work_on_stack(&my_work.pw_work);
+	padata_works_free(&works);
+}
+
static void __padata_list_init(struct padata_list *pd_list)
{
INIT_LIST_HEAD(&pd_list->list);
--
1.7.4.1