QCOM安卓平台子系统复位原因记录分析

参考LA msm-4.14 kernel,先看下在drivers/soc/qcom/subsys-pil-tz.c里记录失败原因的接口:

/*
 * log_failure_reason() - print the subsystem failure reason (SFR) string.
 * @d: PIL-TZ data of the subsystem that reported the failure.
 *
 * Fetches the SMEM item d->smem_id, copies its text into a local buffer of
 * MAX_SSR_REASON_LEN bytes and prints it with pr_err(). The buffer is a
 * stack variable, so the reason is not kept anywhere once this returns.
 */
static void log_failure_reason(const struct pil_tz_data *d)
{
    size_t size;
    char *smem_reason, reason[MAX_SSR_REASON_LEN];
    const char *name = d->subsys_desc.name;

    /* smem_id == -1: nothing to look up (presumably no SFR item configured). */
    if (d->smem_id == -1)
        return;

    smem_reason = qcom_smem_get(QCOM_SMEM_HOST_ANY, d->smem_id, &size);
    if (IS_ERR(smem_reason) || !size) {
        pr_err("%s SFR: (unknown, qcom_smem_get failed).\n",
                                    name);
        return;
    }
    if (!smem_reason[0]) {
        pr_err("%s SFR: (unknown, empty string found).\n", name);
        return;
    }

    /*
     * Bound the read with a precision instead of using strlcpy(): strlcpy()
     * calls strlen() on the source and would run past the end of the SMEM
     * item if the text in it is not NUL-terminated. size is non-zero here,
     * so the "- 1" cannot wrap. The result is the same as strlcpy() gives
     * for a terminated source: at most min(size, MAX_SSR_REASON_LEN) - 1
     * characters followed by a NUL.
     */
    snprintf(reason, sizeof(reason), "%.*s",
             (int)(min(size, (size_t)MAX_SSR_REASON_LEN) - 1), smem_reason);
    pr_err("%s subsystem failure reason: %s.\n", name, reason);
}

调用 qcom_smem_get() 获得subsystem复位原因,记录在栈 reason 中。

ok,既然是stack var,func结束就丢了。如何保存这个 reason ?因为我们不可能老是连着串口,有的crash不一定能复现,pstore有时没有记录,尤其是用户退货机分析。

在调试阶段,我想一个有效方法可以把这个 reason 放到保留memory里。对退货机,就要保存到存储上了,直接从fs层写到block设备上就行了。看了下pstore next加了block支持(全志贡献的),回头看看。这里我们主要看下如何保存到存储上。

直接在这个接口里加blk读写是否可以?我们来看代码上下文。

static irqreturn_t subsys_err_fatal_intr_handler (int irq, void *dev_id)
{
        struct pil_tz_data *d = subsys_to_data(dev_id);

        pr_err("Fatal error on %s!\n", d->subsys_desc.name);
        if (subsys_get_crash_status(d->subsys)) {
                pr_err("%s: Ignoring error fatal, restart in progress\n",
                                                        d->subsys_desc.name);
                return IRQ_HANDLED;
        }
        subsys_set_crash_status(d->subsys, CRASH_STATUS_ERR_FATAL);
        log_failure_reason(d); //tj: here
        subsystem_restart_dev(d->subsys);

        return IRQ_HANDLED;
}
/*
 * Watchdog-bite interrupt handler.
 *
 * Does nothing when a crash status is already set. Otherwise reports the
 * bite, panics if subsys_desc.system_debug is set, and else sets
 * CRASH_STATUS_WDOG_BITE, logs the failure reason and requests a subsystem
 * restart. Always returns IRQ_HANDLED.
 */
static irqreturn_t subsys_wdog_bite_irq_handler(int irq, void *dev_id)
{
        struct pil_tz_data *pil = subsys_to_data(dev_id);

        if (!subsys_get_crash_status(pil->subsys)) {
                pr_err("Watchdog bite received from %s!\n",
                                                pil->subsys_desc.name);

                if (pil->subsys_desc.system_debug)
                        panic("%s: System ramdump requested. Triggering device restart!\n",
                                                        __func__);

                subsys_set_crash_status(pil->subsys, CRASH_STATUS_WDOG_BITE);
                /* Runs in the interrupt handler, before the restart request. */
                log_failure_reason(pil);
                subsystem_restart_dev(pil->subsys);
        }

        return IRQ_HANDLED;
}
/*
 * Handle a watchdog bite: write the ERR_READY bit to the irq_clear
 * register, mark the subsystem CRASH_STATUS_WDOG_BITE, log the failure
 * reason and request a restart. Nothing is done when a crash status is
 * already set.
 */
static void clear_wdog(struct pil_tz_data *d)
{
        /* A non-zero crash status means the device is already restarting. */
        if (subsys_get_crash_status(d->subsys))
                return;

        pr_err("wdog bite received from %s!\n", d->subsys_desc.name);
        __raw_writel(BIT(d->bits_arr[ERR_READY]), d->irq_clear);
        subsys_set_crash_status(d->subsys, CRASH_STATUS_WDOG_BITE);
        log_failure_reason(d);
        subsystem_restart_dev(d->subsys);
}
static irqreturn_t subsys_generic_handler(int irq, void *dev_id)
{
        struct pil_tz_data *d = subsys_to_data(dev_id);
        uint32_t status_val, err_value;

        err_value =  __raw_readl(d->err_status_spare);
        status_val = __raw_readl(d->irq_status);

        if ((status_val & BIT(d->bits_arr[ERR_READY])) && !err_value)
                clear_err_ready(d);

        if ((status_val & BIT(d->bits_arr[ERR_READY])) &&
                                        err_value == 0x44554d50)
                clear_wdog(d);

ok, 是在中断服务里call log_failure_reason()

static int pil_tz_driver_probe(struct platform_device *pdev)
{
    ...
        if (of_property_read_bool(pdev->dev.of_node,
                                        "qcom,pil-generic-irq-handler")) {
                d->subsys_desc.generic_handler = subsys_generic_handler;
    ...
        } else {
                d->subsys_desc.err_fatal_handler =
                                                subsys_err_fatal_intr_handler;
                d->subsys_desc.wdog_bite_handler = subsys_wdog_bite_irq_handler;
}

一般都走 Fatal error or Watchdog bite received

首先设置crash状态:

subsys_set_crash_status(d->subsys, CRASH_STATUS_ERR_FATAL);
subsys_set_crash_status(d->subsys, CRASH_STATUS_WDOG_BITE);
/**
 * subsys_set_crash_status() - store the crash status of a subsystem device
 * @dev: subsystem device to update
 * @crashed: crash status value to record
 *
 * A plain assignment to dev->crashed; no locking is done here.
 */
void subsys_set_crash_status(struct subsys_device *dev,
                             enum crash_status crashed)
{
        dev->crashed = crashed;
}

然后call log_failure_reason() 记录,中断服务最后call subsystem_restart_dev() ,看下这个restart接口:

/**
 * subsystem_restart_dev() - request a restart for a subsystem device
 * @dev: subsystem device that reported the failure
 *
 * Takes a device reference and a module reference, sends the early
 * notifications and then acts on dev->restart_level:
 *   RESET_SUBSYS_COUPLED - calls __subsystem_restart_dev();
 *   RESET_SOC            - schedules dev->device_restart_work.
 * Any other level panics.
 *
 * Return: 0 when the request was handled or SSR is disabled, -ENODEV when a
 * reference could not be taken, -EBUSY when the system is already
 * restarting or powering off.
 */
int subsystem_restart_dev(struct subsys_device *dev)
{
        const char *name;
        int ret = 0;

        if (!get_device(&dev->dev))
                return -ENODEV;

        if (!try_module_get(dev->owner)) {
                put_device(&dev->dev);
                return -ENODEV;
        }

        name = dev->desc->name;

        send_early_notifications(dev->early_notify);

        /*
         * If a system reboot/shutdown is underway, ignore subsystem errors.
         * However, print a message so that we know that a subsystem behaved
         * unexpectedly here.
         */
        if (system_state == SYSTEM_RESTART
                || system_state == SYSTEM_POWER_OFF) {
                pr_err("%s crashed during a system poweroff/shutdown.\n", name);
                ret = -EBUSY;
                /* Drop the references taken above instead of leaking them. */
                goto out_put;
        }

        pr_info("Restart sequence requested for %s, restart_level = %s.\n",
                name, restart_levels[dev->restart_level]);

        if (disable_restart_work == DISABLE_SSR) {
                pr_warn("subsys-restart: Ignoring restart request for %s\n",
                                                                        name);
                /* Ignored request: still release the references. */
                goto out_put;
        }

        switch (dev->restart_level) {

        case RESET_SUBSYS_COUPLED:
                __subsystem_restart_dev(dev);
                break;
        case RESET_SOC:
                __pm_stay_awake(&dev->ssr_wlock);
                schedule_work(&dev->device_restart_work);
                /*
                 * NOTE(review): the references are kept on this path,
                 * presumably because the scheduled work still uses dev
                 * before it panics - confirm.
                 */
                return 0;
        default:
                panic("subsys-restart: Unknown restart level!\n");
                break;
        }

out_put:
        module_put(dev->owner);
        put_device(&dev->dev);

        return ret;
}

先看系统状态如果reboot/shutdown进行中就认为忙,退出。如果禁用了SSR,直接返回。下来是主要功能,按 ->restart_level 走不同的流程,这两个level定义:

/* Printable name of each restart level, indexed by the restart level. */
static const char * const restart_levels[] = {
        [RESET_SUBSYS_COUPLED] = "RELATED",
        [RESET_SOC] = "SYSTEM",
};

先看 RESET_SOC ,从字面看就是系统级的了:

case RESET_SOC:
                __pm_stay_awake(&dev->ssr_wlock);
                schedule_work(&dev->device_restart_work);
                return 0;
struct subsys_device *subsys_register(struct subsys_desc *desc)
{
    ...
        INIT_WORK(&subsys->device_restart_work, device_restart_work_hdlr);
/**
 * struct subsys_device - subsystem device
 ...
 * @device_restart_work: work struct for device restart
 ...
 */
struct subsys_device {
        struct subsys_desc *desc;
        struct work_struct work;
        struct wakeup_source ssr_wlock;
        char wlname[64];
        struct work_struct device_restart_work;

ok, 进work task device_restart_work_hdlr() 看下:

/*
 * Work handler behind device_restart_work: sends SUBSYS_SOC_RESET for the
 * crashed subsystem, waits 100 ms and then panics the kernel.
 */
static void device_restart_work_hdlr(struct work_struct *work)
{
        struct subsys_device *crashed;

        crashed = container_of(work, struct subsys_device,
                               device_restart_work);

        notify_each_subsys_device(&crashed, 1, SUBSYS_SOC_RESET, NULL);

        /*
         * Temporary workaround: sleep briefly until the ramdump userspace
         * application calls sync() and fclose() when attempting the dump.
         */
        msleep(100);

        panic("subsys-restart: Resetting the SoC - %s crashed.",
                                                        crashed->desc->name);
}

call notify_each_subsys_device() 通知每个子系统复位:

/*
 * notify_each_subsys_device() - deliver one notification for a list of devices
 * @list:  array of subsystem devices; NULL entries are skipped
 * @count: number of entries in @list
 * @notif: notification type to deliver
 * @data:  not used by this function
 *
 * For every device in @list this records @notif in dev->notif_state, calls
 * sysmon_send_event() for every other subsystem on subsys_list whose state
 * is SUBSYS_ONLINE, then queues the notification on dev->notify together
 * with a notif_data filled from the device, and finally emits a uevent.
 */
static void notify_each_subsys_device(struct subsys_device **list,
                unsigned int count,
                enum subsys_notif_type notif, void *data)
{
        struct subsys_device *subsys;

        while (count--) {
                struct subsys_device *dev = *list++;
                struct notif_data notif_data;
                struct platform_device *pdev;

                if (!dev)
                        continue;

                pdev = container_of(dev->desc->dev, struct platform_device,
                                                                        dev);
                dev->notif_state = notif;

                /* The walk over subsys_list is done under subsys_list_lock. */
                mutex_lock(&subsys_list_lock);
                list_for_each_entry(subsys, &subsys_list, list)
                        if (dev != subsys &&
                                subsys->track.state == SUBSYS_ONLINE) {
                                /* Each sysmon event is bracketed by a timeout. */
                                setup_timeout(dev->desc, subsys->desc,
                                              SUBSYS_TO_SUBSYS_SYSMON);
                                sysmon_send_event(subsys->desc, dev->desc,
                                                  notif);
                                cancel_timeout(dev->desc);
                        }
                mutex_unlock(&subsys_list_lock);

                if (notif == SUBSYS_AFTER_POWERUP &&
                                dev->track.state == SUBSYS_ONLINE)
                        send_sysmon_notif(dev);

                /* Payload handed to the listeners on dev->notify. */
                notif_data.crashed = subsys_get_crash_status(dev);
                notif_data.enable_ramdump = is_ramdump_enabled(dev);
                notif_data.enable_mini_ramdumps = enable_mini_ramdumps;
                notif_data.no_auth = dev->desc->no_auth;
                notif_data.pdev = pdev;

                trace_pil_notif("before_send_notif", notif, dev->desc->fw_name);
                setup_timeout(dev->desc, NULL, SUBSYS_TO_HLOS);
                subsys_notif_queue_notification(dev->notify, notif,
                                                                &notif_data);
                cancel_timeout(dev->desc);
                trace_pil_notif("after_send_notif", notif, dev->desc->fw_name);
                subsys_notif_uevent(dev->desc, notif);
        }
}

call list_for_each_entry() 去遍历每一个在线子系统, notif_data 包括是否使能 enable_ramdump 和 enable_mini_ramdumps 。如果使能,应该就去dump ram了。

so, 才能有:

/*
         * Temporary workaround until ramdump userspace application calls
         * sync() and fclose() on attempting the dump.
         */
        msleep(100);

am i right? 最后call kernel panic()。

ok, 我们再看另一个level: RELATED

case RESET_SUBSYS_COUPLED:
                __subsystem_restart_dev(dev);
                break;
/*
 * Start the restart of one subsystem.
 *
 * Under track->s_lock: if the tracked state is not yet SUBSYS_CRASHED and
 * the device is SUBSYS_ONLINE, mark it SUBSYS_CRASHED, take the SSR wakeup
 * source and queue dev->work on ssr_wq. A crash reported while the state is
 * SUBSYS_RESTARTING panics. In every other case the request is dropped,
 * with a WARN when the device is SUBSYS_OFFLINE.
 */
static void __subsystem_restart_dev(struct subsys_device *dev)
{
        const char *name = dev->desc->name;
        struct subsys_tracking *track;
        unsigned long flags;

        pr_debug("Restarting %s [level=%s]!\n", name,
                        restart_levels[dev->restart_level]);

        track = subsys_get_track(dev);

        /*
         * Drivers may call subsystem_restart{_dev}() as often as they like
         * up until the point where the subsystem is shut down.
         */
        spin_lock_irqsave(&track->s_lock, flags);
        if (track->p_state == SUBSYS_CRASHED ||
                                        dev->track.state != SUBSYS_ONLINE) {
                WARN(dev->track.state == SUBSYS_OFFLINE,
                        "SSR aborted: %s subsystem not online\n", name);
        } else if (track->p_state == SUBSYS_RESTARTING) {
                panic("Subsystem %s crashed during SSR!", name);
        } else {
                track->p_state = SUBSYS_CRASHED;
                __pm_stay_awake(&dev->ssr_wlock);
                queue_work(ssr_wq, &dev->work);
        }
        spin_unlock_irqrestore(&track->s_lock, flags);
}
INIT_WORK(&subsys->work, subsystem_restart_wq_func);

check subsystem_restart_wq_func() :

static void subsystem_restart_wq_func(struct work_struct *work)
{
    ...
        pr_debug("[%s:%d]: Starting restart sequence for %s\n",
                        current->comm, current->pid, desc->name);
        notify_each_subsys_device(list, count, SUBSYS_BEFORE_SHUTDOWN, NULL);
        ret = for_each_subsys_device(list, count, NULL, subsystem_shutdown);
        if (ret)
                goto err;
        notify_each_subsys_device(list, count, SUBSYS_AFTER_SHUTDOWN, NULL);

        notify_each_subsys_device(list, count, SUBSYS_RAMDUMP_NOTIFICATION,
                                                                        NULL);

        spin_lock_irqsave(&track->s_lock, flags);
        track->p_state = SUBSYS_RESTARTING;
        spin_unlock_irqrestore(&track->s_lock, flags);

        /* Collect ram dumps for all subsystems in order here */
        for_each_subsys_device(list, count, NULL, subsystem_ramdump);

        for_each_subsys_device(list, count, NULL, subsystem_free_memory);

        notify_each_subsys_device(list, count, SUBSYS_BEFORE_POWERUP, NULL);
        ret = for_each_subsys_device(list, count, NULL, subsystem_powerup);
        if (ret)
                goto err;
        notify_each_subsys_device(list, count, SUBSYS_AFTER_POWERUP, NULL);

        pr_info("[%s:%d]: Restart sequence for %s completed.\n",
                        current->comm, current->pid, desc->name);
    ...
/*
 * Call fn(dev, data) for each non-NULL device in list, in order.
 *
 * Stops at the first non-zero result from fn and returns it; returns 0
 * when every call returned 0 (or when there was nothing to call).
 */
static int for_each_subsys_device(struct subsys_device **list,
                unsigned int count, void *data,
                int (*fn)(struct subsys_device *, void *))
{
        unsigned int i;

        for (i = 0; i < count; i++) {
                struct subsys_device *dev = list[i];
                int ret;

                if (!dev)
                        continue;

                ret = fn(dev, data);
                if (ret)
                        return ret;
        }

        return 0;
}

能看到这个接口在做ramdump:

/*
 * Run the subsystem's ramdump callback, if it has one, passing it whether
 * ramdumps are enabled for the device. A negative result is only logged.
 * dev->do_ramdump_on_put is cleared in all cases.
 *
 * Always returns 0; @data is not used.
 */
static int subsystem_ramdump(struct subsys_device *dev, void *data)
{
        const char *name = dev->desc->name;

        if (dev->desc->ramdump &&
            dev->desc->ramdump(is_ramdump_enabled(dev), dev->desc) < 0)
                pr_warn("%s[%s:%d]: Ramdump failed.\n",
                        name, current->comm, current->pid);

        dev->do_ramdump_on_put = false;

        return 0;
}

可以看到,他没有像 SYSTEM level那样直接call kernel panic。也就是所谓的subsystem restart?

好了,到这里应该知道在哪里加入复位原因到block device了。如果不知道,可以参考下面的实现ifuwant:]

我来评几句
登录后评论

已发表评论数()

相关站点

热门文章