Diffstat (limited to 'kernel/sched')
-rw-r--r--  kernel/sched/Makefile             |   1
-rw-r--r--  kernel/sched/core.c               |  59
-rw-r--r--  kernel/sched/cpufreq.c            |  17
-rw-r--r--  kernel/sched/cpufreq_schedutil.c  |   8
-rw-r--r--  kernel/sched/fair.c               |   7
-rw-r--r--  kernel/sched/idle.c               |   3
-rw-r--r--  kernel/sched/tune_dummy.c         | 122
7 files changed, 202 insertions(+), 15 deletions(-)
diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile
index e9adba01c456..5bd0fbecc37b 100644
--- a/kernel/sched/Makefile
+++ b/kernel/sched/Makefile
@@ -26,6 +26,7 @@ obj-$(CONFIG_SCHED_AUTOGROUP) += autogroup.o
obj-$(CONFIG_SCHEDSTATS) += stats.o
obj-$(CONFIG_SCHED_DEBUG) += debug.o
obj-$(CONFIG_SCHED_TUNE) += tune.o
+obj-$(CONFIG_SCHED_TUNE_DUMMY) += tune_dummy.o
obj-$(CONFIG_CGROUP_CPUACCT) += cpuacct.o
obj-$(CONFIG_CPU_FREQ) += cpufreq.o
obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2e3c650573bb..5f832ae982e6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3544,6 +3544,50 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev,
prepare_arch_switch(next);
}
+void release_task_stack(struct task_struct *tsk);
+static void task_async_free(struct work_struct *work)
+{
+ struct task_struct *t = container_of(work, typeof(*t), async_free.work);
+ bool free_stack = READ_ONCE(t->async_free.free_stack);
+
+ atomic_set(&t->async_free.running, 0);
+
+ if (free_stack) {
+ release_task_stack(t);
+ put_task_struct(t);
+ } else {
+ __put_task_struct(t);
+ }
+}
+
+static void finish_task_switch_dead(struct task_struct *prev)
+{
+ if (atomic_cmpxchg(&prev->async_free.running, 0, 1)) {
+ put_task_stack(prev);
+ put_task_struct(prev);
+ return;
+ }
+
+ if (atomic_dec_and_test(&prev->stack_refcount)) {
+ prev->async_free.free_stack = true;
+ } else if (atomic_dec_and_test(&prev->usage)) {
+ prev->async_free.free_stack = false;
+ } else {
+ atomic_set(&prev->async_free.running, 0);
+ return;
+ }
+
+ INIT_WORK(&prev->async_free.work, task_async_free);
+ queue_work(system_unbound_wq, &prev->async_free.work);
+}
+
+static void mmdrop_async_free(struct work_struct *work)
+{
+ struct mm_struct *mm = container_of(work, typeof(*mm), async_put_work);
+
+ __mmdrop(mm);
+}
+
/**
* finish_task_switch - clean up after a task-switch
* @prev: the thread we just switched away from.
@@ -3617,8 +3661,10 @@ static struct rq *finish_task_switch(struct task_struct *prev)
kcov_finish_switch(current);
fire_sched_in_preempt_notifiers(current);
- if (mm)
- mmdrop(mm);
+ if (mm && atomic_dec_and_test(&mm->mm_count)) {
+ INIT_WORK(&mm->async_put_work, mmdrop_async_free);
+ queue_work(system_unbound_wq, &mm->async_put_work);
+ }
if (unlikely(prev_state == TASK_DEAD)) {
if (prev->sched_class->task_dead)
prev->sched_class->task_dead(prev);
@@ -3629,11 +3675,7 @@ static struct rq *finish_task_switch(struct task_struct *prev)
*/
kprobe_flush_task(prev);
- /* Task is done with its stack. */
- put_task_stack(prev);
-
- put_task_struct(prev);
-
+ finish_task_switch_dead(prev);
}
tick_nohz_task_switch();
@@ -5002,7 +5044,8 @@ static void __setscheduler_params(struct task_struct *p,
if (policy == SETPARAM_POLICY)
policy = p->policy;
- p->policy = policy;
+ /* Replace SCHED_FIFO with SCHED_RR so FIFO tasks cannot monopolize the CPU and hurt latency */
+ p->policy = policy == SCHED_FIFO ? SCHED_RR : policy;
if (dl_policy(policy))
__setparam_dl(p, attr);
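
The core.c hunks above move the final teardown of a dead task (its stack and task_struct) and the last mmdrop() off the context-switch path onto system_unbound_wq. The same deferred-free pattern, reduced to its essentials, is sketched below; struct foo, foo_free_fn() and foo_release() are illustrative names, not part of this patch, and only the standard workqueue API is assumed.

#include <linux/slab.h>
#include <linux/workqueue.h>

struct foo {
	struct work_struct free_work;
	/* ... payload ... */
};

/* Runs later in process context on an unbound workqueue. */
static void foo_free_fn(struct work_struct *work)
{
	struct foo *f = container_of(work, struct foo, free_work);

	kfree(f);
}

/* Called from a hot path: only queues the work, does not free. */
static void foo_release(struct foo *f)
{
	INIT_WORK(&f->free_work, foo_free_fn);
	queue_work(system_unbound_wq, &f->free_work);
}
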
diff --git a/kernel/sched/cpufreq.c b/kernel/sched/cpufreq.c
index dbc51442ecbc..c2129347f793 100644
--- a/kernel/sched/cpufreq.c
+++ b/kernel/sched/cpufreq.c
@@ -8,6 +8,7 @@
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
+#include <linux/cpufreq.h>
#include "sched.h"
@@ -61,3 +62,19 @@ void cpufreq_remove_update_util_hook(int cpu)
rcu_assign_pointer(per_cpu(cpufreq_update_util_data, cpu), NULL);
}
EXPORT_SYMBOL_GPL(cpufreq_remove_update_util_hook);
+
+/**
+ * cpufreq_can_do_remote_dvfs - Check if cpufreq policy can be updated.
+ * @policy: cpufreq policy to check.
+ *
+ * Return 'true' if:
+ * - the local and remote CPUs share @policy,
+ * - dvfs_possible_from_any_cpu is set in @policy and the local CPU is not going
+ * offline (in which case it is not expected to run cpufreq updates any more).
+ */
+bool cpufreq_can_do_remote_dvfs(struct cpufreq_policy *policy)
+{
+ return cpumask_test_cpu(smp_processor_id(), policy->cpus) ||
+ (policy->dvfs_possible_from_any_cpu &&
+ rcu_dereference_sched(*this_cpu_ptr(&cpufreq_update_util_data)));
+}
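
cpufreq_can_do_remote_dvfs() keys off the per-CPU cpufreq_update_util_data pointer installed by cpufreq_add_update_util_hook(): it is non-NULL only while a governor hook is registered on the local CPU, i.e. while that CPU is still expected to process cpufreq updates. A minimal sketch of the registration side, using the hook API this file already exports (the my_* names are illustrative, not part of the patch):

static DEFINE_PER_CPU(struct update_util_data, my_hook);

static void my_update_hook(struct update_util_data *data, u64 time,
			   unsigned int flags)
{
	/* Evaluate utilization and trigger a frequency update here. */
}

static void my_gov_start(int cpu)
{
	/*
	 * While this hook is installed, cpufreq_can_do_remote_dvfs()
	 * can report true for remote updates targeting @cpu.
	 */
	cpufreq_add_update_util_hook(cpu, &per_cpu(my_hook, cpu),
				     my_update_hook);
}

static void my_gov_stop(int cpu)
{
	cpufreq_remove_update_util_hook(cpu);
	synchronize_sched();	/* wait for in-flight callbacks */
}
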
diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index d92acbfece89..83b58e7fa864 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -108,12 +108,10 @@ static bool sugov_should_update_freq(struct sugov_policy *sg_policy, u64 time)
* by the hardware, as calculating the frequency is pointless if
* we cannot in fact act on it.
*
- * For the slow switching platforms, the kthread is always scheduled on
- * the right set of CPUs and any CPU can find the next frequency and
- * schedule the kthread.
+ * This is needed on the slow switching platforms too to prevent CPUs
+ * going offline from leaving stale IRQ work items behind.
*/
- if (sg_policy->policy->fast_switch_enabled &&
- !cpufreq_can_do_remote_dvfs(sg_policy->policy))
+ if (!cpufreq_can_do_remote_dvfs(sg_policy->policy))
return false;
if (unlikely(sg_policy->need_freq_update)) {
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index f676735e500a..0b521c46a86e 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -7822,6 +7822,13 @@ static inline int find_best_target(struct task_struct *p, int *backup_cpu,
continue;
/*
+ * Skip the active-CPU search for tasks that have
+ * high priority and prefer_high_cap set.
+ */
+ if (prefer_high_cap && p->prio <= DEFAULT_PRIO)
+ continue;
+
+ /*
* Case A.2: Target ACTIVE CPU
* Favor CPUs with max spare capacity.
*/
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index c8f70ea89099..63b871ddce22 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -196,7 +196,7 @@ static void cpuidle_idle_call(void)
*/
next_state = cpuidle_select(drv, dev, &stop_tick);
- if (stop_tick || tick_nohz_tick_stopped())
+ if (stop_tick)
tick_nohz_idle_stop_tick();
else
tick_nohz_idle_retain_tick();
@@ -239,7 +239,6 @@ static void do_idle(void)
*/
__current_set_polling();
- quiet_vmstat();
tick_nohz_idle_enter();
while (!need_resched()) {
diff --git a/kernel/sched/tune_dummy.c b/kernel/sched/tune_dummy.c
new file mode 100644
index 000000000000..271e7d9cc2f3
--- /dev/null
+++ b/kernel/sched/tune_dummy.c
@@ -0,0 +1,122 @@
+#include <linux/cgroup.h>
+#include "sched.h"
+
+#define BOOSTGROUPS_COUNT 5
+
+struct schedtune {
+ /* SchedTune CGroup subsystem */
+ struct cgroup_subsys_state css;
+
+ /* Boost value for tasks on that SchedTune CGroup */
+ int boost;
+
+ /* Hint to bias scheduling of tasks on that SchedTune CGroup
+ * towards idle CPUs */
+ int prefer_idle;
+};
+
+static struct schedtune
+root_schedtune = {
+ .boost = 0,
+ .prefer_idle = 0,
+};
+
+static struct schedtune *allocated_group[BOOSTGROUPS_COUNT] = {
+ &root_schedtune,
+ NULL,
+};
+
+static inline struct schedtune *css_st(struct cgroup_subsys_state *css)
+{
+ return container_of(css, struct schedtune, css);
+}
+
+static u64
+prefer_idle_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return 0;
+}
+
+static int
+prefer_idle_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ u64 prefer_idle)
+{
+ return 0;
+}
+
+static s64
+boost_read(struct cgroup_subsys_state *css, struct cftype *cft)
+{
+ return 0;
+}
+
+static int
+boost_write(struct cgroup_subsys_state *css, struct cftype *cft,
+ s64 boost)
+{
+ return 0;
+}
+
+static struct cftype files[] = {
+ {
+ .name = "boost",
+ .read_s64 = boost_read,
+ .write_s64 = boost_write,
+ },
+ {
+ .name = "prefer_idle",
+ .read_u64 = prefer_idle_read,
+ .write_u64 = prefer_idle_write,
+ },
+ { } /* terminate */
+};
+
+static struct cgroup_subsys_state *
+schedtune_css_alloc(struct cgroup_subsys_state *parent_css)
+{
+ struct schedtune *st;
+ int idx;
+
+ if (!parent_css)
+ return &root_schedtune.css;
+
+ /* Allow only single-level hierarchies */
+ if (parent_css != &root_schedtune.css) {
+ pr_err("Nested SchedTune boosting groups not allowed\n");
+ return ERR_PTR(-ENOMEM);
+ }
+
+ /* Allow only a limited number of boosting groups */
+ for (idx = 1; idx < BOOSTGROUPS_COUNT; ++idx)
+ if (!allocated_group[idx])
+ break;
+ if (idx == BOOSTGROUPS_COUNT) {
+ pr_err("Trying to create more than %d SchedTune boosting groups\n",
+ BOOSTGROUPS_COUNT);
+ return ERR_PTR(-ENOSPC);
+ }
+
+ st = kzalloc(sizeof(*st), GFP_KERNEL);
+ if (!st)
+ goto out;
+
+ return &st->css;
+
+out:
+ return ERR_PTR(-ENOMEM);
+}
+
+static void
+schedtune_css_free(struct cgroup_subsys_state *css)
+{
+ struct schedtune *st = css_st(css);
+
+ kfree(st);
+}
+
+struct cgroup_subsys schedtune_cgrp_subsys = {
+ .css_alloc = schedtune_css_alloc,
+ .css_free = schedtune_css_free,
+ .legacy_cftypes = files,
+ .early_init = 1,
+};
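
tune_dummy.c keeps the schedtune cgroup hierarchy and its boost/prefer_idle attribute files mountable and writable while every read returns 0, so userspace written against the full SchedTune controller keeps working unchanged. For context, a hedged sketch of how scheduler code typically resolves a task to its schedtune group (the pattern used by the non-dummy tune.c); schedtune_task_boost() is an illustrative helper, not part of this patch, and it assumes schedtune_cgrp_id is generated for this controller via cgroup_subsys.h:

static inline int schedtune_task_boost(struct task_struct *p)
{
	struct schedtune *st;
	int boost;

	/* Resolve the task's schedtune css under RCU protection. */
	rcu_read_lock();
	st = css_st(task_css(p, schedtune_cgrp_id));
	boost = st->boost;	/* always 0 with the dummy controller */
	rcu_read_unlock();

	return boost;
}
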