Commit f21a4f75 authored by Tejun Heo's avatar Tejun Heo
Browse files

Merge branch 'for-3.15-fixes' of...

Merge branch 'for-3.15-fixes' of git://git.kernel.org/pub/scm/linux/kernel/git/tj/cgroup into for-3.16

Pull to receive e37a06f1 ("cgroup: fix the retry path of
cgroup_mount()") to avoid unnecessary conflicts with planned
cgroup_tree_mutex removal and also to be able to remove the temp fix
added by 36c38fb7

 ("blkcg: use trylock on blkcg_pol_mutex in
blkcg_reset_stats()") afterwards.
Signed-off-by: default avatarTejun Heo <tj@kernel.org>
parents d39ea871 36e9d2eb
......@@ -451,7 +451,20 @@ static int blkcg_reset_stats(struct cgroup_subsys_state *css,
struct blkcg_gq *blkg;
int i;
mutex_lock(&blkcg_pol_mutex);
/*
* XXX: We invoke cgroup_add/rm_cftypes() under blkcg_pol_mutex
* which ends up putting cgroup's internal cgroup_tree_mutex under
* it; however, cgroup_tree_mutex is nested above cgroup file
* active protection and grabbing blkcg_pol_mutex from a cgroup
* file operation creates a possible circular dependency. cgroup
* internal locking is planned to go through further simplification
* and this issue should go away soon. For now, let's trylock
* blkcg_pol_mutex and restart the write on failure.
*
* http://lkml.kernel.org/g/5363C04B.4010400@oracle.com
*/
if (!mutex_trylock(&blkcg_pol_mutex))
return restart_syscall();
spin_lock_irq(&blkcg->lock);
/*
......
......@@ -521,6 +521,7 @@ struct cftype {
};
extern struct cgroup_root cgrp_dfl_root;
extern struct css_set init_css_set;
static inline bool cgroup_on_dfl(const struct cgroup *cgrp)
{
......@@ -751,6 +752,20 @@ static inline struct cgroup_subsys_state *task_css(struct task_struct *task,
return task_css_check(task, subsys_id, false);
}
/**
* task_css_is_root - test whether a task belongs to the root css
* @task: the target task
* @subsys_id: the target subsystem ID
*
* Test whether @task belongs to the root css on the specified subsystem.
* May be invoked in any context.
*/
static inline bool task_css_is_root(struct task_struct *task, int subsys_id)
{
return task_css_check(task, subsys_id, true) ==
init_css_set.subsys[subsys_id];
}
static inline struct cgroup *task_cgroup(struct task_struct *task,
int subsys_id)
{
......
......@@ -439,7 +439,7 @@ struct cgrp_cset_link {
* reference-counted, to improve performance when child cgroups
* haven't been created.
*/
static struct css_set init_css_set = {
struct css_set init_css_set = {
.refcount = ATOMIC_INIT(1),
.cgrp_links = LIST_HEAD_INIT(init_css_set.cgrp_links),
.tasks = LIST_HEAD_INIT(init_css_set.tasks),
......
......@@ -21,6 +21,7 @@
#include <linux/uaccess.h>
#include <linux/freezer.h>
#include <linux/seq_file.h>
#include <linux/mutex.h>
/*
* A cgroup is freezing if any FREEZING flags are set. FREEZING_SELF is
......@@ -42,9 +43,10 @@ enum freezer_state_flags {
struct freezer {
struct cgroup_subsys_state css;
unsigned int state;
spinlock_t lock;
};
static DEFINE_MUTEX(freezer_mutex);
static inline struct freezer *css_freezer(struct cgroup_subsys_state *css)
{
return css ? container_of(css, struct freezer, css) : NULL;
......@@ -93,7 +95,6 @@ freezer_css_alloc(struct cgroup_subsys_state *parent_css)
if (!freezer)
return ERR_PTR(-ENOMEM);
spin_lock_init(&freezer->lock);
return &freezer->css;
}
......@@ -110,14 +111,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
struct freezer *freezer = css_freezer(css);
struct freezer *parent = parent_freezer(freezer);
/*
* The following double locking and freezing state inheritance
* guarantee that @cgroup can never escape ancestors' freezing
* states. See css_for_each_descendant_pre() for details.
*/
if (parent)
spin_lock_irq(&parent->lock);
spin_lock_nested(&freezer->lock, SINGLE_DEPTH_NESTING);
mutex_lock(&freezer_mutex);
freezer->state |= CGROUP_FREEZER_ONLINE;
......@@ -126,10 +120,7 @@ static int freezer_css_online(struct cgroup_subsys_state *css)
atomic_inc(&system_freezing_cnt);
}
spin_unlock(&freezer->lock);
if (parent)
spin_unlock_irq(&parent->lock);
mutex_unlock(&freezer_mutex);
return 0;
}
......@@ -144,14 +135,14 @@ static void freezer_css_offline(struct cgroup_subsys_state *css)
{
struct freezer *freezer = css_freezer(css);
spin_lock_irq(&freezer->lock);
mutex_lock(&freezer_mutex);
if (freezer->state & CGROUP_FREEZING)
atomic_dec(&system_freezing_cnt);
freezer->state = 0;
spin_unlock_irq(&freezer->lock);
mutex_unlock(&freezer_mutex);
}
static void freezer_css_free(struct cgroup_subsys_state *css)
......@@ -175,7 +166,7 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
struct task_struct *task;
bool clear_frozen = false;
spin_lock_irq(&freezer->lock);
mutex_lock(&freezer_mutex);
/*
* Make the new tasks conform to the current state of @new_css.
......@@ -197,21 +188,13 @@ static void freezer_attach(struct cgroup_subsys_state *new_css,
}
}
spin_unlock_irq(&freezer->lock);
/*
* Propagate FROZEN clearing upwards. We may race with
* update_if_frozen(), but as long as both work bottom-up, either
* update_if_frozen() sees child's FROZEN cleared or we clear the
* parent's FROZEN later. No parent w/ !FROZEN children can be
* left FROZEN.
*/
/* propagate FROZEN clearing upwards */
while (clear_frozen && (freezer = parent_freezer(freezer))) {
spin_lock_irq(&freezer->lock);
freezer->state &= ~CGROUP_FROZEN;
clear_frozen = freezer->state & CGROUP_FREEZING;
spin_unlock_irq(&freezer->lock);
}
mutex_unlock(&freezer_mutex);
}
/**
......@@ -228,9 +211,6 @@ static void freezer_fork(struct task_struct *task)
{
struct freezer *freezer;
rcu_read_lock();
freezer = task_freezer(task);
/*
* The root cgroup is non-freezable, so we can skip locking the
* freezer. This is safe regardless of race with task migration.
......@@ -238,24 +218,18 @@ static void freezer_fork(struct task_struct *task)
* to do. If we lost and root is the new cgroup, noop is still the
* right thing to do.
*/
if (!parent_freezer(freezer))
goto out;
if (task_css_is_root(task, freezer_cgrp_id))
return;
/*
* Grab @freezer->lock and freeze @task after verifying @task still
* belongs to @freezer and it's freezing. The former is for the
* case where we have raced against task migration and lost and
* @task is already in a different cgroup which may not be frozen.
* This isn't strictly necessary as freeze_task() is allowed to be
* called spuriously but let's do it anyway for, if nothing else,
* documentation.
*/
spin_lock_irq(&freezer->lock);
if (freezer == task_freezer(task) && (freezer->state & CGROUP_FREEZING))
mutex_lock(&freezer_mutex);
rcu_read_lock();
freezer = task_freezer(task);
if (freezer->state & CGROUP_FREEZING)
freeze_task(task);
spin_unlock_irq(&freezer->lock);
out:
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
}
/**
......@@ -281,22 +255,24 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
struct css_task_iter it;
struct task_struct *task;
WARN_ON_ONCE(!rcu_read_lock_held());
spin_lock_irq(&freezer->lock);
lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZING) ||
(freezer->state & CGROUP_FROZEN))
goto out_unlock;
return;
/* are all (live) children frozen? */
rcu_read_lock();
css_for_each_child(pos, css) {
struct freezer *child = css_freezer(pos);
if ((child->state & CGROUP_FREEZER_ONLINE) &&
!(child->state & CGROUP_FROZEN))
goto out_unlock;
!(child->state & CGROUP_FROZEN)) {
rcu_read_unlock();
return;
}
}
rcu_read_unlock();
/* are all tasks frozen? */
css_task_iter_start(css, &it);
......@@ -317,21 +293,29 @@ static void update_if_frozen(struct cgroup_subsys_state *css)
freezer->state |= CGROUP_FROZEN;
out_iter_end:
css_task_iter_end(&it);
out_unlock:
spin_unlock_irq(&freezer->lock);
}
static int freezer_read(struct seq_file *m, void *v)
{
struct cgroup_subsys_state *css = seq_css(m), *pos;
mutex_lock(&freezer_mutex);
rcu_read_lock();
/* update states bottom-up */
css_for_each_descendant_post(pos, css)
css_for_each_descendant_post(pos, css) {
if (!css_tryget(pos))
continue;
rcu_read_unlock();
update_if_frozen(pos);
rcu_read_lock();
css_put(pos);
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
seq_puts(m, freezer_state_strs(css_freezer(css)->state));
seq_putc(m, '\n');
......@@ -373,7 +357,7 @@ static void freezer_apply_state(struct freezer *freezer, bool freeze,
unsigned int state)
{
/* also synchronizes against task migration, see freezer_attach() */
lockdep_assert_held(&freezer->lock);
lockdep_assert_held(&freezer_mutex);
if (!(freezer->state & CGROUP_FREEZER_ONLINE))
return;
......@@ -414,31 +398,29 @@ static void freezer_change_state(struct freezer *freezer, bool freeze)
* descendant will try to inherit its parent's FREEZING state as
* CGROUP_FREEZING_PARENT.
*/
mutex_lock(&freezer_mutex);
rcu_read_lock();
css_for_each_descendant_pre(pos, &freezer->css) {
struct freezer *pos_f = css_freezer(pos);
struct freezer *parent = parent_freezer(pos_f);
spin_lock_irq(&pos_f->lock);
if (!css_tryget(pos))
continue;
rcu_read_unlock();
if (pos_f == freezer) {
if (pos_f == freezer)
freezer_apply_state(pos_f, freeze,
CGROUP_FREEZING_SELF);
} else {
/*
* Our update to @parent->state is already visible
* which is all we need. No need to lock @parent.
* For more info on synchronization, see
* freezer_post_create().
*/
else
freezer_apply_state(pos_f,
parent->state & CGROUP_FREEZING,
CGROUP_FREEZING_PARENT);
}
spin_unlock_irq(&pos_f->lock);
rcu_read_lock();
css_put(pos);
}
rcu_read_unlock();
mutex_unlock(&freezer_mutex);
}
static int freezer_write(struct cgroup_subsys_state *css, struct cftype *cft,
......
......@@ -306,57 +306,138 @@ static int devcgroup_seq_show(struct seq_file *m, void *v)
}
/**
* may_access - verifies if a new exception is part of what is allowed
* by a dev cgroup based on the default policy +
* exceptions. This is used to make sure a child cgroup
* won't have more privileges than its parent or to
* verify if a certain access is allowed.
* @dev_cgroup: dev cgroup to be tested against
* @refex: new exception
* @behavior: behavior of the exception
* match_exception - iterates the exception list trying to find a complete match
* @exceptions: list of exceptions
* @type: device type (DEV_BLOCK or DEV_CHAR)
* @major: device file major number, ~0 to match all
* @minor: device file minor number, ~0 to match all
* @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD)
*
* It is considered a complete match if an exception is found that will
* contain the entire range of provided parameters.
*
* Return: true in case it matches an exception completely
*/
static bool may_access(struct dev_cgroup *dev_cgroup,
struct dev_exception_item *refex,
enum devcg_behavior behavior)
static bool match_exception(struct list_head *exceptions, short type,
u32 major, u32 minor, short access)
{
struct dev_exception_item *ex;
bool match = false;
rcu_lockdep_assert(rcu_read_lock_held() ||
lockdep_is_held(&devcgroup_mutex),
"device_cgroup::may_access() called without proper synchronization");
list_for_each_entry_rcu(ex, exceptions, list) {
if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
continue;
if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
continue;
if (ex->major != ~0 && ex->major != major)
continue;
if (ex->minor != ~0 && ex->minor != minor)
continue;
/* provided access cannot have more than the exception rule */
if (access & (~ex->access))
continue;
return true;
}
return false;
}
/**
* match_exception_partial - iterates the exception list trying to find a partial match
* @exceptions: list of exceptions
* @type: device type (DEV_BLOCK or DEV_CHAR)
* @major: device file major number, ~0 to match all
* @minor: device file minor number, ~0 to match all
* @access: permission mask (ACC_READ, ACC_WRITE, ACC_MKNOD)
*
* It is considered a partial match if an exception's range is found to
* contain *any* of the devices specified by provided parameters. This is
* used to make sure no extra access is being granted that is forbidden by
* any of the exception list.
*
* Return: true in case the provided range mat matches an exception completely
*/
static bool match_exception_partial(struct list_head *exceptions, short type,
u32 major, u32 minor, short access)
{
struct dev_exception_item *ex;
list_for_each_entry_rcu(ex, &dev_cgroup->exceptions, list) {
if ((refex->type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
list_for_each_entry_rcu(ex, exceptions, list) {
if ((type & DEV_BLOCK) && !(ex->type & DEV_BLOCK))
continue;
if ((refex->type & DEV_CHAR) && !(ex->type & DEV_CHAR))
if ((type & DEV_CHAR) && !(ex->type & DEV_CHAR))
continue;
if (ex->major != ~0 && ex->major != refex->major)
/*
* We must be sure that both the exception and the provided
* range aren't masking all devices
*/
if (ex->major != ~0 && major != ~0 && ex->major != major)
continue;
if (ex->minor != ~0 && ex->minor != refex->minor)
if (ex->minor != ~0 && minor != ~0 && ex->minor != minor)
continue;
if (refex->access & (~ex->access))
/*
* In order to make sure the provided range isn't matching
* an exception, all its access bits shouldn't match the
* exception's access bits
*/
if (!(access & ex->access))
continue;
match = true;
break;
return true;
}
return false;
}
/**
* verify_new_ex - verifies if a new exception is allowed by parent cgroup's permissions
* @dev_cgroup: dev cgroup to be tested against
* @refex: new exception
* @behavior: behavior of the exception's dev_cgroup
*
* This is used to make sure a child cgroup won't have more privileges
* than its parent
*/
static bool verify_new_ex(struct dev_cgroup *dev_cgroup,
struct dev_exception_item *refex,
enum devcg_behavior behavior)
{
bool match = false;
rcu_lockdep_assert(rcu_read_lock_held() ||
lockdep_is_held(&devcgroup_mutex),
"device_cgroup:verify_new_ex called without proper synchronization");
if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) {
if (behavior == DEVCG_DEFAULT_ALLOW) {
/* the exception will deny access to certain devices */
/*
* new exception in the child doesn't matter, only
* adding extra restrictions
*/
return true;
} else {
/* the exception will allow access to certain devices */
/*
* new exception in the child will add more devices
* that can be acessed, so it can't match any of
* parent's exceptions, even slightly
*/
match = match_exception_partial(&dev_cgroup->exceptions,
refex->type,
refex->major,
refex->minor,
refex->access);
if (match)
/*
* a new exception allowing access shouldn't
* match an parent's exception
*/
return false;
return true;
}
} else {
/* only behavior == DEVCG_DEFAULT_DENY allowed here */
/*
* Only behavior == DEVCG_DEFAULT_DENY allowed here, therefore
* the new exception will add access to more devices and must
* be contained completely in an parent's exception to be
* allowed
*/
match = match_exception(&dev_cgroup->exceptions, refex->type,
refex->major, refex->minor,
refex->access);
if (match)
/* parent has an exception that matches the proposed */
return true;
......@@ -378,7 +459,38 @@ static int parent_has_perm(struct dev_cgroup *childcg,
if (!parent)
return 1;
return may_access(parent, ex, childcg->behavior);
return verify_new_ex(parent, ex, childcg->behavior);
}
/**
* parent_allows_removal - verify if it's ok to remove an exception
* @childcg: child cgroup from where the exception will be removed
* @ex: exception being removed
*
* When removing an exception in cgroups with default ALLOW policy, it must
* be checked if removing it will give the child cgroup more access than the
* parent.
*
* Return: true if it's ok to remove exception, false otherwise
*/
static bool parent_allows_removal(struct dev_cgroup *childcg,
struct dev_exception_item *ex)
{
struct dev_cgroup *parent = css_to_devcgroup(css_parent(&childcg->css));
if (!parent)
return true;
/* It's always allowed to remove access to devices */
if (childcg->behavior == DEVCG_DEFAULT_DENY)
return true;
/*
* Make sure you're not removing part or a whole exception existing in
* the parent cgroup
*/
return !match_exception_partial(&parent->exceptions, ex->type,
ex->major, ex->minor, ex->access);
}
/**
......@@ -616,17 +728,21 @@ static int devcgroup_update_access(struct dev_cgroup *devcgroup,
switch (filetype) {
case DEVCG_ALLOW:
if (!parent_has_perm(devcgroup, &ex))
return -EPERM;
/*
* If the default policy is to allow by default, try to remove
* an matching exception instead. And be silent about it: we
* don't want to break compatibility
*/
if (devcgroup->behavior == DEVCG_DEFAULT_ALLOW) {
/* Check if the parent allows removing it first */
if (!parent_allows_removal(devcgroup, &ex))
return -EPERM;
dev_exception_rm(devcgroup, &ex);
return 0;
break;
}
if (!parent_has_perm(devcgroup, &ex))
return -EPERM;
rc = dev_exception_add(devcgroup, &ex);
break;
case DEVCG_DENY:
......@@ -704,18 +820,18 @@ static int __devcgroup_check_permission(short type, u32 major, u32 minor,
short access)
{
struct dev_cgroup *dev_cgroup;
struct dev_exception_item ex;
int rc;
memset(&ex, 0, sizeof(ex));
ex.type = type;
ex.major = major;
ex.minor = minor;
ex.access = access;
bool rc;
rcu_read_lock();
dev_cgroup = task_devcgroup(current);
rc = may_access(dev_cgroup, &ex, dev_cgroup->behavior);
if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW)
/* Can't match any of the exceptions, even partially */
rc = !match_exception_partial(&dev_cgroup->exceptions,
type, major, minor, access);
else
/* Need to match completely one exception to be allowed */
rc = match_exception(&dev_cgroup->exceptions, type, major,
minor, access);
rcu_read_unlock();
if (!rc)
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment