 drivers/devfreq/arm-memlat-mon.c | 146
 drivers/soc/qcom/scm.c           |  21
 include/soc/qcom/scm.h           |   4
 3 files changed, 143 insertions(+), 28 deletions(-)
diff --git a/drivers/devfreq/arm-memlat-mon.c b/drivers/devfreq/arm-memlat-mon.c
index b82d54d47fd4..33e50b1b8a0b 100644
--- a/drivers/devfreq/arm-memlat-mon.c
+++ b/drivers/devfreq/arm-memlat-mon.c
@@ -58,6 +58,7 @@ struct cpu_pmu_stats {
struct cpu_grp_info {
cpumask_t cpus;
+ unsigned long any_cpu_ev_mask;
unsigned int event_ids[NUM_EVENTS];
struct cpu_pmu_stats *cpustats;
struct memlat_hwmon hw;
@@ -67,6 +68,13 @@ struct memlat_mon_spec {
bool is_compute;
};
+struct ipi_data {
+ unsigned long cnts[NR_CPUS][NUM_EVENTS];
+ struct task_struct *waiter_task;
+ struct cpu_grp_info *cpu_grp;
+ atomic_t cpus_left;
+};
+
#define to_cpustats(cpu_grp, cpu) \
(&cpu_grp->cpustats[cpu - cpumask_first(&cpu_grp->cpus)])
#define to_devstats(cpu_grp, cpu) \
@@ -93,32 +101,84 @@ static unsigned long compute_freq(struct cpu_pmu_stats *cpustats,
}
#define MAX_COUNT_LIM 0xFFFFFFFFFFFFFFFF
-static inline unsigned long read_event(struct event_data *event)
+static unsigned long read_event(struct cpu_pmu_stats *cpustats, int event_id)
{
+ struct event_data *event = &cpustats->events[event_id];
unsigned long ev_count;
- u64 total, enabled, running;
+ u64 total;
- if (!event->pevent)
+ if (!event->pevent || perf_event_read_local(event->pevent, &total))
return 0;
- total = perf_event_read_value(event->pevent, &enabled, &running);
ev_count = total - event->prev_count;
event->prev_count = total;
return ev_count;
}
-static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)
+static void read_perf_counters(struct ipi_data *ipd, int cpu)
{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+ struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+ int ev;
+
+ for (ev = 0; ev < NUM_EVENTS; ev++) {
+ if (!(cpu_grp->any_cpu_ev_mask & BIT(ev)))
+ ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+ }
+}
+
+static void read_evs_ipi(void *info)
+{
+ int cpu = raw_smp_processor_id();
+ struct ipi_data *ipd = info;
+ struct task_struct *waiter;
+
+ read_perf_counters(ipd, cpu);
+
+ /*
+ * Wake up the waiter task if we're the final CPU. The ipi_data pointer
+ * isn't safe to dereference once cpus_left reaches zero, so the waiter
+ * task_struct pointer must be cached before that. Also defend against
+ * the extremely unlikely possibility that the waiter task will have
+ * exited by the time wake_up_process() is reached.
+ */
+ waiter = ipd->waiter_task;
+ get_task_struct(waiter);
+ if (atomic_fetch_andnot(BIT(cpu), &ipd->cpus_left) == BIT(cpu) &&
+ waiter->state != TASK_RUNNING)
+ wake_up_process(waiter);
+ put_task_struct(waiter);
+}
+
+static void read_any_cpu_events(struct ipi_data *ipd, unsigned long cpus)
+{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
+ int cpu, ev;
+
+ if (!cpu_grp->any_cpu_ev_mask)
+ return;
+
+ for_each_cpu(cpu, to_cpumask(&cpus)) {
+ struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
+
+ for_each_set_bit(ev, &cpu_grp->any_cpu_ev_mask, NUM_EVENTS)
+ ipd->cnts[cpu][ev] = read_event(cpustats, ev);
+ }
+}
+
+static void compute_perf_counters(struct ipi_data *ipd, int cpu)
+{
+ struct cpu_grp_info *cpu_grp = ipd->cpu_grp;
struct cpu_pmu_stats *cpustats = to_cpustats(cpu_grp, cpu);
struct dev_stats *devstats = to_devstats(cpu_grp, cpu);
unsigned long cyc_cnt, stall_cnt;
- devstats->inst_count = read_event(&cpustats->events[INST_IDX]);
- devstats->mem_count = read_event(&cpustats->events[CM_IDX]);
- cyc_cnt = read_event(&cpustats->events[CYC_IDX]);
+ devstats->inst_count = ipd->cnts[cpu][INST_IDX];
+ devstats->mem_count = ipd->cnts[cpu][CM_IDX];
+ cyc_cnt = ipd->cnts[cpu][CYC_IDX];
devstats->freq = compute_freq(cpustats, cyc_cnt);
if (cpustats->events[STALL_CYC_IDX].pevent) {
- stall_cnt = read_event(&cpustats->events[STALL_CYC_IDX]);
+ stall_cnt = ipd->cnts[cpu][STALL_CYC_IDX];
stall_cnt = min(stall_cnt, cyc_cnt);
devstats->stall_pct = mult_frac(100, stall_cnt, cyc_cnt);
} else {
@@ -128,19 +188,69 @@ static void read_perf_counters(int cpu, struct cpu_grp_info *cpu_grp)
static unsigned long get_cnt(struct memlat_hwmon *hw)
{
- int cpu;
struct cpu_grp_info *cpu_grp = to_cpu_grp(hw);
+ unsigned long cpus_read_mask, tmp_mask;
+ call_single_data_t csd[NR_CPUS];
+ struct ipi_data ipd;
+ int cpu, this_cpu;
+
+ ipd.waiter_task = current;
+ ipd.cpu_grp = cpu_grp;
+
+ /* Dispatch asynchronous IPIs to each CPU to read the perf events */
+ cpus_read_lock();
+ preempt_disable();
+ this_cpu = raw_smp_processor_id();
+ cpus_read_mask = *cpumask_bits(&cpu_grp->cpus);
+ tmp_mask = cpus_read_mask & ~BIT(this_cpu);
+ ipd.cpus_left = (atomic_t)ATOMIC_INIT(tmp_mask);
+ for_each_cpu(cpu, to_cpumask(&tmp_mask)) {
+ /*
+ * Some SCM calls take very long (20+ ms), so the IPI could lag
+ * on the CPU running the SCM call. Skip offline CPUs too.
+ */
+ csd[cpu].flags = 0;
+ if (under_scm_call(cpu) ||
+ generic_exec_single(cpu, &csd[cpu], read_evs_ipi, &ipd))
+ cpus_read_mask &= ~BIT(cpu);
+ }
+ cpus_read_unlock();
+ /* Read this CPU's events while the IPIs run */
+ if (cpus_read_mask & BIT(this_cpu))
+ read_perf_counters(&ipd, this_cpu);
+ preempt_enable();
+
+ /* Bail out if there weren't any CPUs available */
+ if (!cpus_read_mask)
+ return 0;
+
+ /* Read any any-CPU events while the IPIs run */
+ read_any_cpu_events(&ipd, cpus_read_mask);
+
+ /* Clear out CPUs which were skipped */
+ atomic_andnot(cpus_read_mask ^ tmp_mask, &ipd.cpus_left);
/*
- * Some of SCM call is very heavy(+20ms) so perf IPI could
- * be stuck on the CPU which contributes long latency.
+ * Wait until all the IPIs are done reading their events, and compute
+ * each finished CPU's results while waiting since some CPUs may finish
+ * reading their events faster than others.
*/
- if (under_scm_call()) {
- return 0;
+ for (tmp_mask = cpus_read_mask;;) {
+ unsigned long cpus_done, cpus_left;
+
+ set_current_state(TASK_UNINTERRUPTIBLE);
+ cpus_left = (unsigned int)atomic_read(&ipd.cpus_left);
+ if ((cpus_done = cpus_left ^ tmp_mask)) {
+ for_each_cpu(cpu, to_cpumask(&cpus_done))
+ compute_perf_counters(&ipd, cpu);
+ if (!cpus_left)
+ break;
+ tmp_mask = cpus_left;
+ } else {
+ schedule();
+ }
}
-
- for_each_cpu(cpu, &cpu_grp->cpus)
- read_perf_counters(cpu, cpu_grp);
+ __set_current_state(TASK_RUNNING);
return 0;
}
@@ -217,6 +327,8 @@ static int set_events(struct cpu_grp_info *cpu_grp, int cpu)
goto err_out;
cpustats->events[i].pevent = pevent;
perf_event_enable(pevent);
+ if (cpumask_equal(&pevent->readable_on_cpus, &CPU_MASK_ALL))
+ cpu_grp->any_cpu_ev_mask |= BIT(i);
}
kfree(attr);
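
For readers unfamiliar with the completion idiom used by read_evs_ipi() and get_cnt() above, here is a minimal userspace analogue: each worker clears its own bit from a shared mask with an atomic fetch-and, and the worker that removes the last bit wakes the waiter. This is an illustrative sketch built on C11 atomics and POSIX threads, not kernel code; all names (NR_WORKERS, workers_left, and so on) are invented for the example, and the kernel version additionally computes each CPU's results as its bit clears and handles CPUs that were skipped.

/*
 * Userspace sketch of the "last worker wakes the waiter" pattern.
 * Illustrative only; the kernel code uses atomic_fetch_andnot(),
 * set_current_state()/schedule() and wake_up_process() instead.
 */
#include <pthread.h>
#include <stdatomic.h>
#include <stdio.h>

#define NR_WORKERS 4

static atomic_ulong workers_left;
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t done = PTHREAD_COND_INITIALIZER;

static void *worker(void *arg)
{
	unsigned long bit = 1UL << (long)arg;

	/* ... read this worker's counters here ... */

	/* Clear our bit; if the old mask was exactly our bit, we are last */
	if (atomic_fetch_and(&workers_left, ~bit) == bit) {
		pthread_mutex_lock(&lock);
		pthread_cond_signal(&done);
		pthread_mutex_unlock(&lock);
	}
	return NULL;
}

int main(void)
{
	pthread_t threads[NR_WORKERS];
	long i;

	atomic_store(&workers_left, (1UL << NR_WORKERS) - 1);

	pthread_mutex_lock(&lock);
	for (i = 0; i < NR_WORKERS; i++)
		pthread_create(&threads[i], NULL, worker, (void *)i);

	/* Sleep until the last worker signals completion */
	while (atomic_load(&workers_left))
		pthread_cond_wait(&done, &lock);
	pthread_mutex_unlock(&lock);

	for (i = 0; i < NR_WORKERS; i++)
		pthread_join(threads[i], NULL);
	puts("all workers done");
	return 0;
}

get_cnt() above folds the per-CPU result computation into the wait loop, so finished CPUs are processed while slower ones are still reading their events; the sketch omits that part.
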
diff --git a/drivers/soc/qcom/scm.c b/drivers/soc/qcom/scm.c
index b067c9baf4e2..fc36e8db1ab5 100644
--- a/drivers/soc/qcom/scm.c
+++ b/drivers/soc/qcom/scm.c
@@ -36,7 +36,7 @@
#define SCM_EBUSY -55
#define SCM_V2_EBUSY -12
-static atomic_t scm_call_count = ATOMIC_INIT(0);
+static DEFINE_PER_CPU(atomic_t, scm_call_count);
static DEFINE_MUTEX(scm_lock);
/*
@@ -433,11 +433,12 @@ static int ___scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
static int __scm_call_armv8_64(u64 x0, u64 x1, u64 x2, u64 x3, u64 x4, u64 x5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_64(x0, x1, x2, x3, x4, x5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -495,11 +496,12 @@ static int ___scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -557,11 +559,12 @@ static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
static int __scm_call_armv8_32(u32 w0, u32 w1, u32 w2, u32 w3, u32 w4, u32 w5,
u64 *ret1, u64 *ret2, u64 *ret3)
{
+ atomic_t *cnt = per_cpu_ptr(&scm_call_count, raw_smp_processor_id());
int ret;
- atomic_inc(&scm_call_count);
+ atomic_inc(cnt);
ret = ___scm_call_armv8_32(w0, w1, w2, w3, w4, w5, ret1, ret2, ret3);
- atomic_dec(&scm_call_count);
+ atomic_dec(cnt);
return ret;
}
@@ -1352,7 +1355,7 @@ inline int scm_enable_mem_protection(void)
#endif
EXPORT_SYMBOL(scm_enable_mem_protection);
-bool under_scm_call(void)
+bool under_scm_call(int cpu)
{
- return atomic_read(&scm_call_count);
+ return atomic_read(per_cpu_ptr(&scm_call_count, cpu));
}
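
The point of switching from one global scm_call_count to a per-CPU counter is that a single counter makes every CPU look busy whenever any CPU is inside an SCM call, while a per-CPU counter lets callers such as get_cnt() skip only the CPU that is actually stuck. Below is a rough userspace sketch of that bookkeeping; the names (busy_count, MAX_CPUS, busy_enter/busy_exit/is_busy) are invented for illustration and it assumes the caller can supply a stable cpu index.

/*
 * Userspace sketch of per-CPU "busy" bookkeeping. The kernel patch
 * expresses the same idea with DEFINE_PER_CPU(atomic_t, scm_call_count)
 * and per_cpu_ptr(), as shown in the hunks above.
 */
#include <stdatomic.h>
#include <stdbool.h>

#define MAX_CPUS 8

static atomic_int busy_count[MAX_CPUS];

static void busy_enter(int cpu)
{
	atomic_fetch_add(&busy_count[cpu], 1);
}

static void busy_exit(int cpu)
{
	atomic_fetch_sub(&busy_count[cpu], 1);
}

/* A caller can now skip only the CPU that is actually inside a call */
static bool is_busy(int cpu)
{
	return atomic_load(&busy_count[cpu]) != 0;
}
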
diff --git a/include/soc/qcom/scm.h b/include/soc/qcom/scm.h
index e8d47986972a..5e80139a9d84 100644
--- a/include/soc/qcom/scm.h
+++ b/include/soc/qcom/scm.h
@@ -124,7 +124,7 @@ struct scm_hdcp_req {
};
extern struct mutex scm_lmh_lock;
-extern bool under_scm_call(void);
+extern bool under_scm_call(int cpu);
#else
static inline int scm_call2(u32 cmd_id, struct scm_desc *desc)
@@ -186,7 +186,7 @@ static inline int scm_enable_mem_protection(void)
{
return 0;
}
-extern bool under_scm_call(void)
+extern bool under_scm_call(int cpu)
{
return false;
}