 drivers/devfreq/arm-memlat-mon.c | 146
 drivers/soc/qcom/scm.c           |  21
 include/soc/qcom/scm.h           |   4
 3 files changed, 143 insertions(+), 28 deletions(-)
diff --git a/drivers/devfreq/arm-memlat-mon.c b/drivers/devfreq/arm-memlat-mon.c
index b82d54d47fd4..33e50b1b8a0b 100644
--- a/drivers/devfreq/arm-memlat-mon.c
+++ b/drivers/devfreq/arm-memlat-mon.c
@@ -58,6 +58,7 @@ struct cpu_pmu_stats {
struct cpu_grp_info {
cpumask_t cpus;
+ unsigned long any_cpu_ev_mask;
unsigned int event_ids[NUM_EVENTS];
struct cpu_pmu_stats *cpustats;
struct memlat_hwmon hw;
@@ -67,6 +68,13 @@ struct memlat_mon_spec {
bool is_compute;
};
+struct ipi_data {
+ unsigned long cnts[NR_CPUS][NUM_EVENTS];
+ struct task_struct *waiter_task;
+ struct cpu_grp_info *cpu_grp;
+ atomic_t cpus_left;
+};
+
#define to_cpustats(cpu_grp, cpu) \
(&cpu_grp->cpustats[cpu - cpumask_first(&cpu_grp->cpus)])
#define to_devstats(cpu_grp, cpu) \
@@ -93,32 +101,84 @@ static unsigned long compute_freq(struct cpu_pmu_stats *cpustats,
}
#define MAX_COUNT_LIM 0xFFFFFFFFFFFFFFFF
-static inline unsigned long read_event(struct event_data *event)
+static unsigned long read_event(struct cpu_pmu_stats *cpustats, int event_id)
{
+ struct event_data *event = &cpustats->events[event_id];
unsigned long ev_count;
- u64 total, enabled, running;
+ u64 total;
- if (!event->pevent)
+ if (!event->pevent || perf_event_read_local(event->pevent, &total))
return 0;
- total = perf_event_read_value(event->pevent, &enabled, &running);
ev_count = total - event->prev_count;
event->prev_count = total;
return ev_count;
}
-static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)
+static void read_perf_counters(struct ipi_data *ipd, int cpu)
{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+ struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+ int ev;
+
+ for (ev = 0; ev < NUM_EVENTS; ev++) {
+ if (!(cpu_grp->any_cpu_ev_mask & BIT(ev)))
+ ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+ }
+}
+
+static void read_evs_ipi(void *info)
+{
+ int cpu = raw_smp_processor_id();
+ struct ipi_data *ipd = info;
+ struct task_struct *waiter;
+
+ read_perf_counters(ipd, cpu);
+
+ /*
+ * Wake up the waiter task if we're the final CPU. The ipi_data pointer
+ * isn't safe to dereference once cpus_left reaches zero, so the waiter
+ * task_struct pointer must be cached before that. Also defend against
+ * the extremely unlikely possibility that the waiter task will have
+ * exited by the time wake_up_process() is reached.
+ */
+ waiter = ipd->waiter_task;
+ get_task_struct(waiter);
+ if (atomic_fetch_andnot(BIT(cpu), &ipd->cpus_left) == BIT(cpu) &&
+ waiter->state != TASK_RUNNING)
+ wake_up_process(waiter);
+ put_task_struct(waiter);
+}
+
+static void read_any_cpu_events(struct ipi_data *ipd, unsigned long cpus)
+{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+ int cpu, ev;
+
+ if (!cpu_grp->any_cpu_ev_mask)
+ return;
+
+ for_each_cpu(cpu, to_cpumask(&cpus)) {
+ struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+
+ for_each_set_bit(ev, &cpu_grp->any_cpu_ev_mask, NUM_EVENTS)
+ ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+ }
+}
+
+static void compute_perf_counters(struct ipi_data *ipd, int cpu)
+{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
struct dev_stats *devstats = to_devstats(cpu_grp, cpu);
unsigned long cyc_cnt, stall_cnt;
- devstats->inst_count = read_event(&cpustats->events[INST_IDX]);
- devstats->mem_count = read_event(&cpustats->events[CM_IDX]);
- cyc_cnt = read_event(&cpustats->events[CYC_IDX]);
+ devstats->inst_count = ipd->cnts[cpu][INST_IDX];
+ devstats->mem_count = ipd->cnts[cpu][CM_IDX];
+ cyc_cnt = ipd->cnts[cpu][CYC_IDX];
devstats->freq = compute_freq(cpustats, cyc_cnt);
if (cpustats->events[STALL_CYC_IDX].pevent) {
- stall_cnt = read_event(&cpustats->events[STALL_CYC_IDX]);
+ stall_cnt = ipd->cnts[cpu][STALL_CYC_IDX];
stall_cnt = min(stall_cnt, cyc_cnt);
devstats->stall_pct = mult_frac(100, stall_cnt, cyc_cnt);
} else {
@@ -128,19 +188,69 @@ static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)
static unsigned long get_cnt(struct memlat_hwmon *hw)
{
- int cpu;
struct cpu_grp_info *cpu_grp = to_cpu_grp(hw);
+ unsigned long cpus_read_mask, tmp_mask;
+ call_single_data_t csd[NR_CPUS];
+ struct ipi_data ipd;
+ int cpu, this_cpu;
+
+ ipd.waiter_task = current;
+ ipd.cpu_grp = cpu_grp;
+
+ /* Dispatch asynchronous IPIs to each CPU to read the perf events */
+ cpus_read_lock();
+ preempt_disable();
+ this_cpu = raw_smp_processor_id();
+ cpus_read_mask = *cpumask_bits(&cpu_grp->cpus);
+ tmp_mask = cpus_read_mask & ~BIT(this_cpu);
+ ipd.cpus_left = (atomic_t)ATOMIC_INIT(tmp_mask);
+ for_each_cpu(cpu, to_cpumask(&tmp_mask)) {
+ /*
+ * Some SCM calls take very long (20+ ms), so the IPI could lag
+ * on the CPU running the SCM call. Skip offline CPUs too.
+ */
+ csd[cpu].flags = 0;
+ if (under_scm_call(cpu) ||
+ generic_exec_single(cpu, &csd[cpu], read_evs_ipi, &ipd))
+ cpus_read_mask &= ~BIT(cpu);
+ }
+ cpus_read_unlock();
+ /* Read this CPU's events while the IPIs run */
+ if (cpus_read_mask & BIT(this_cpu))
+ read_perf_counters(&ipd, this_cpu);
+ preempt_enable();
+
+ /* Bail out if there weren't any CPUs available */
+ if (!cpus_read_mask)
+ return 0;
+
+ /* Read any any-CPU events while the IPIs run */
+ read_any_cpu_events(&ipd, cpus_read_mask);
+
+ /* Clear out CPUs which were skipped */
+ atomic_andnot(cpus_read_mask ^ tmp_mask, &ipd.cpus_left);
/*
- * Some of SCM call is very heavy(+20ms) so perf IPI could
- * be stuck on the CPU which contributes long latency.
+ * Wait until all the IPIs are done reading their events, and compute
+ * each finished CPU's results while waiting since some CPUs may finish
+ * reading their events faster than others.
*/
- if (under_scm_call()) {
- return 0;
+ for (tmp_mask = cpus_read_mask;;) {
+ unsigned long cpus_done, cpus_left;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ cpus_left = (unsigned int)atomic_read(&ipd.cpus_left);
+ if ((cpus_done = cpus_left ^ tmp_mask)) {
+ for_each_cpu(cpu, to_cpumask(&cpus_done))
+ compute_perf_counters(&ipd, cpu);
+ if (!cpus_left)
+ break;
+ tmp_mask = cpus_left;
+ } else {
+ schedule();
+ }
}
-
- for_each_cpu(cpu, &cpu_grp->cpus)
- read_perf_counters(cpu, cpu_grp);
+ __set_current_state(TASK_RUNNING);
return 0;
}
@@ -217,6 +327,8 @@ static int set_events(struct cpu_grp_info *cpu_grp, int cpu)
goto err_out;
cpustats->events[i].pevent = pevent;
perf_event_enable(pevent);
+ if (cpumask_equal(&pevent->readable_on_cpus, &CPU_MASK_ALL))
+ cpu_grp->any_cpu_ev_mask |= BIT(i);
}
kfree(attr);
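
For readers unfamiliar with the completion idiom used by read_evs_ipi() and get_cnt() above, here is a minimal userspace analogue: each worker clears its own bit from a shared mask with an atomic fetch-and, and the worker that removes the last bit wakes the waiter. This is an illustrative sketch built on C11 atomics and POSIX threads, not kernel code; all names (NR_WORKERS, workers_left, and so on) are invented for the example, and the kernel version additionally computes each CPU's results as its bit clears and handles CPUs that were skipped.

/*
 * Userspace sketch of the "last worker wakes the waiter" pattern.
 * Illustrative only; the kernel code uses atomic_fetch_andnot(),
 * set_current_state()/schedule() and wake_up_process() instead.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_ulong workers_left;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

static void *worker(void *arg)
{
	unsigned long bit = 1UL << (long)arg;

	/* ... read this worker's counters here ... */

	/* Clear our bit; if the old mask was exactly our bit, we are last */
	if (atomic_fetch_and(&workers_left, ~bit) == bit) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];
	long i;

	atomic_store(&workers_left, (1UL << NR_WORKERS) - 1);

	pthread_mutex_lock(&lock);
	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&threads[i], NULL, worker, (void *)i);

	/* Sleep until the last worker signals completion */
	while (atomic_load(&workers_left))
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);
	puts("all workers done");
	return 0;
}

get_cnt() above folds the per-CPU result computation into the wait loop, so finished CPUs are processed while slower ones are still reading their events; the sketch omits that part.
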
diff --git a/drivers/soc/qcom/scm.c b/drivers/soc/qcom/scm.c
index b067c9baf4e2..fc36e8db1ab5 100644
--- a/drivers/soc/qcom/scm.c
+++ b/drivers/soc/qcom/scm.c
@@ -36,7 +36,7 @@
#define SCM_EBUSY -55
#define SCM_V2_EBUSY -12
-static atomic_t scm_call_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(atomic_t, scm_call_count);
static DEFINE_MUTEX(scm_lock);
/*
@@ -433,11 +433,12 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -495,11 +496,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -557,11 +559,12 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -1352,7 +1355,7 @@ inline int scm_enable_mem_protection(void)
#endif
EXPORT_SYMBOL(scm_enable_mem_protection);
-bool under_scm_call(void)
+bool under_scm_call(int cpu)
{
- return atomic_read(&scm_call_count);
+ return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
}
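
The point of switching from one global scm_call_count to a per-CPU counter is that a single counter makes every CPU look busy whenever any CPU is inside an SCM call, while a per-CPU counter lets callers such as get_cnt() skip only the CPU that is actually stuck. Below is a rough userspace sketch of that bookkeeping; the names (busy_count, MAX_CPUS, busy_enter/busy_exit/is_busy) are invented for illustration and it assumes the caller can supply a stable cpu index.

/*
 * Userspace sketch of per-CPU "busy" bookkeeping. The kernel patch
 * expresses the same idea with DEFINE_PER_CPU(atomic_t, scm_call_count)
 * and per_cpu_ptr(), as shown in the hunks above.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define MAX_CPUS 8

static atomic_int busy_count[MAX_CPUS];

static void busy_enter(int cpu)
{
	atomic_fetch_add(&busy_count[cpu], 1);
}

static void busy_exit(int cpu)
{
	atomic_fetch_sub(&busy_count[cpu], 1);
}

/* A caller can now skip only the CPU that is actually inside a call */
static bool is_busy(int cpu)
{
	return atomic_load(&busy_count[cpu]) != 0;
}
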
diff --git a/include/soc/qcom/scm.h b/include/soc/qcom/scm.h
index e8d47986972a..5e80139a9d84 100644
--- a/include/soc/qcom/scm.h
+++ b/include/soc/qcom/scm.h
@@ -124,7 +124,7 @@ struct scm_hdcp_req {
};
extern struct mutex scm_lmh_lock;
-extern bool under_scm_call(void);
+extern bool under_scm_call(int cpu);
#else
static inline int scm_call2(u32 cmd_id, struct scm_desc *desc)
@@ -186,7 +186,7 @@ static inline int scm_enable_mem_protection(void)
{
return 0;
}
-extern bool under_scm_call(void)
+extern bool under_scm_call(int cpu)
{
return false;
}