Commit 45b07ef3 authored by Paul Jackson's avatar Paul Jackson Committed by Linus Torvalds
Browse files

[PATCH] cpusets: swap migration interface

Add a boolean "memory_migrate" to each cpuset, represented by a file
containing "0" or "1" in each directory below /dev/cpuset.

It defaults to false (file contains "0").  It can be set true by writing
"1" to the file.

If true, then anytime that a task is attached to the cpuset so marked, the
pages of that task will be moved to that cpuset, preserving, to the extent
practical, the cpuset-relative placement of the pages.

Also anytime that a cpuset so marked has its memory placement changed (by
writing to its "mems" file), the tasks in that cpuset will have their pages
moved to the cpusets new nodes, preserving, to the extent practical, the
cpuset-relative placement of the moved pages.
Signed-off-by: default avatarPaul Jackson <>
Cc: Christoph Lameter <>
Signed-off-by: default avatarAndrew Morton <>
Signed-off-by: default avatarLinus Torvalds <>
parent d0d96328
......@@ -192,6 +192,7 @@ containing the following files describing that cpuset:
- cpus: list of CPUs in that cpuset
- mems: list of Memory Nodes in that cpuset
- memory_migrate flag: if set, move pages to cpusets nodes
- cpu_exclusive flag: is cpu placement exclusive?
- mem_exclusive flag: is memory placement exclusive?
- tasks: list of tasks (by pid) attached to that cpuset
......@@ -277,6 +278,30 @@ rewritten to the 'tasks' file of its cpuset. This is done to avoid
impacting the scheduler code in the kernel with a check for changes
in a tasks processor placement.
Normally, once a page is allocated (given a physical page
of main memory) then that page stays on whatever node it
was allocated, so long as it remains allocated, even if the
cpusets memory placement policy 'mems' subsequently changes.
If the cpuset flag file 'memory_migrate' is set true, then when
tasks are attached to that cpuset, any pages that task had
allocated to it on nodes in its previous cpuset are migrated
to the tasks new cpuset. Depending on the implementation,
this migration may either be done by swapping the page out,
so that the next time the page is referenced, it will be paged
into the tasks new cpuset, usually on the node where it was
referenced, or this migration may be done by directly copying
the pages from the tasks previous cpuset to the new cpuset,
where possible to the same node, relative to the new cpuset,
as the node that held the page, relative to the old cpuset.
Also if 'memory_migrate' is set true, then if that cpusets
'mems' file is modified, pages allocated to tasks in that
cpuset, that were on nodes in the previous setting of 'mems',
will be moved to nodes in the new setting of 'mems.' Again,
depending on the implementation, this might be done by swapping,
or by direct copying. In either case, pages that were not in
the tasks prior cpuset, or in the cpusets prior 'mems' setting,
will not be moved.
There is an exception to the above. If hotplug functionality is used
to remove all the CPUs that are currently assigned to a cpuset,
then the kernel will automatically update the cpus_allowed of all
......@@ -235,6 +235,13 @@ static inline struct zonelist *huge_zonelist(struct vm_area_struct *vma,
return NODE_DATA(0)->node_zonelists + gfp_zone(GFP_HIGHUSER);
static inline int do_migrate_pages(struct mm_struct *mm,
const nodemask_t *from_nodes,
const nodemask_t *to_nodes, int flags)
return 0;
static inline void check_highest_zone(int k)
......@@ -87,6 +87,7 @@ struct cpuset {
typedef enum {
} cpuset_flagbits_t;
......@@ -112,6 +113,11 @@ static inline int notify_on_release(const struct cpuset *cs)
return !!test_bit(CS_NOTIFY_ON_RELEASE, &cs->flags);
static inline int is_memory_migrate(const struct cpuset *cs)
return !!test_bit(CS_MEMORY_MIGRATE, &cs->flags);
* Increment this atomic integer everytime any cpuset changes its
* mems_allowed value. Users of cpusets can track this generation
......@@ -602,16 +608,24 @@ static void refresh_mems(void)
if (current->cpuset_mems_generation != my_cpusets_mem_gen) {
struct cpuset *cs;
nodemask_t oldmem = current->mems_allowed;
int migrate;
cs = current->cpuset;
migrate = is_memory_migrate(cs);
guarantee_online_mems(cs, &current->mems_allowed);
current->cpuset_mems_generation = cs->mems_generation;
if (!nodes_equal(oldmem, current->mems_allowed))
if (!nodes_equal(oldmem, current->mems_allowed)) {
numa_policy_rebind(&oldmem, &current->mems_allowed);
if (migrate) {
do_migrate_pages(current->mm, &oldmem,
......@@ -795,7 +809,7 @@ static int update_nodemask(struct cpuset *cs, char *buf)
* update_flag - read a 0 or a 1 in a file and update associated flag
* bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
* cs: the cpuset to update
* buf: the buffer where we read the 0 or 1
......@@ -848,6 +862,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
struct task_struct *tsk;
struct cpuset *oldcs;
cpumask_t cpus;
nodemask_t from, to;
if (sscanf(pidbuf, "%d", &pid) != 1)
return -EIO;
......@@ -893,7 +908,12 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
guarantee_online_cpus(cs, &cpus);
set_cpus_allowed(tsk, cpus);
from = oldcs->mems_allowed;
to = cs->mems_allowed;
if (is_memory_migrate(cs))
do_migrate_pages(tsk->mm, &from, &to, MPOL_MF_MOVE_ALL);
if (atomic_dec_and_test(&oldcs->count))
check_for_release(oldcs, ppathbuf);
......@@ -905,6 +925,7 @@ static int attach_task(struct cpuset *cs, char *pidbuf, char **ppathbuf)
typedef enum {
......@@ -960,6 +981,9 @@ static ssize_t cpuset_common_file_write(struct file *file, const char __user *us
retval = update_flag(CS_NOTIFY_ON_RELEASE, cs, buffer);
retval = update_flag(CS_MEMORY_MIGRATE, cs, buffer);
retval = attach_task(cs, buffer, &pathbuf);
......@@ -1060,6 +1084,9 @@ static ssize_t cpuset_common_file_read(struct file *file, char __user *buf,
*s++ = notify_on_release(cs) ? '1' : '0';
*s++ = is_memory_migrate(cs) ? '1' : '0';
retval = -EINVAL;
goto out;
......@@ -1408,6 +1435,11 @@ static struct cftype cft_notify_on_release = {
static struct cftype cft_memory_migrate = {
.name = "memory_migrate",
static int cpuset_populate_dir(struct dentry *cs_dentry)
int err;
......@@ -1422,6 +1454,8 @@ static int cpuset_populate_dir(struct dentry *cs_dentry)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_memory_migrate)) < 0)
return err;
if ((err = cpuset_add_file(cs_dentry, &cft_tasks)) < 0)
return err;
return 0;
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment