分类
KSM -x64 hypervisor 开源项目

Ksm开源Vt项目分析 1 Ksm的Vcpu-EPT初始化

Ksm 的启动是通过 ksm_subvert 这个函数来的，函数原型：int ksm_subvert(struct ksm *k);

/* Accumulated (OR'd) return value of the per-CPU DPC/IPI callbacks.  */
static int __g_dpc_logical_rval = 0;

#ifndef __linux__
/* NT kernel primitives for broadcasting a DPC to every processor.  */
NTKERNELAPI VOID KeGenericCallDpc(PKDEFERRED_ROUTINE Routine,
                  PVOID Context);
NTKERNELAPI VOID KeSignalCallDpcDone(PVOID SystemArgument1);
NTKERNELAPI LOGICAL KeSignalCallDpcSynchronize(PVOID SystemArgument2);

/*
 * Define the per-CPU DPC routine __percpu_<name>.  It invokes `call`
 * with the supplied arguments, ORs the result into
 * __g_dpc_logical_rval, then signals synchronization and completion so
 * KeGenericCallDpc() can release the other processors.
 */
#define DEFINE_DPC(name, call, ...)                                     \
    VOID __percpu_##name(PRKDPC dpc, void *ctx, void *sys0, void *sys1) \
    {                                                                   \
        UNREFERENCED_PARAMETER(dpc);                                    \
        __g_dpc_logical_rval |= (call) (__VA_ARGS__);                   \
        KeSignalCallDpcSynchronize(sys1);                               \
        KeSignalCallDpcDone(sys0);                                      \
    }

/* Run __percpu_<name> on every processor and collect the results.  */
#define CALL_DPC(name, ...) do {                                        \
    __g_dpc_logical_rval = 0;                                           \
    KeGenericCallDpc(__percpu_##name, __VA_ARGS__);                     \
} while (0)

/*
 * Queue __percpu_<name> on one specific processor at high importance.
 * `fail` is a statement executed when the DPC object cannot be
 * allocated.
 *
 * Background: in the Windows kernel every CPU has its own KPRC
 * structure; DPCs are maintained in its DpcData[2] and DpcStack
 * members:
 *
 *     typedef struct _KDPC_DATA {
 *         LIST_ENTRY DpcListHead;       // head of the DPC queue
 *         ULONG DpcLock;                // taken before touching the queue
 *         volatile ULONG DpcQueueDepth;
 *         ULONG DpcCount;
 *     } KDPC_DATA, *PKDPC_DATA;
 *
 * There are two kinds of DPCs: normal DPCs, which may run in any
 * thread context, and threaded DPCs, which are executed by a dedicated
 * kernel thread.
 */
#define CALL_DPC_ON_CPU(cpu, name, fail, ...) do {                      \
    __g_dpc_logical_rval = 0;                                           \
    PROCESSOR_NUMBER proc_nr;                                           \
    KeGetProcessorNumberFromIndex((cpu), &proc_nr);                     \
    PKDPC dpc = mm_alloc_pool(sizeof(*dpc));                            \
    if (!dpc)                                                           \
        fail;                                                           \
    KeInitializeDpc(dpc, __percpu_##name, __VA_ARGS__);                 \
    KeSetImportanceDpc(dpc, HighImportance);                            \
    KeSetTargetProcessorDpcEx(dpc, &proc_nr);                           \
    KeInsertQueueDpc(dpc, NULL, NULL);                                  \
} while (0)
#else
/* Linux: broadcast through cross-CPU function calls (IPIs) instead.  */
#define DEFINE_DPC(name, call, ...)                                     \
    void __percpu_##name(void *ctx)                                     \
    {                                                                   \
        __g_dpc_logical_rval |= (call) (__VA_ARGS__);                   \
    }

#define CALL_DPC(name, ...) do {                                        \
    int cpu;                                                            \
    __g_dpc_logical_rval = 0;                                           \
    for_each_online_cpu(cpu)                                            \
        smp_call_function_single(cpu, __percpu_##name, __VA_ARGS__, 1); \
} while (0)

#define CALL_DPC_ON_CPU(cpu, name, fail, ...) do {                      \
    __g_dpc_logical_rval = 0;                                           \
    smp_call_function_single(cpu, __percpu_##name, __VA_ARGS__, 1);     \
} while (0)
#endif

/* Result of the last CALL_DPC*() broadcast (0 means every CPU succeeded). */
#define DPC_RET()   __g_dpc_logical_rval
#endif  /* NOTE(review): closes an include guard opened outside this excerpt */
不得不说作者的代码看着很舒服,我一直在学习这样人的编码风格,asamy的宏真的看着舒服
/* Per-CPU callback: virtualizes the calling processor.  */
static DEFINE_DPC(__call_init, __ksm_init_cpu, ctx);

/*
 * Subvert (virtualize) all processors by broadcasting __ksm_init_cpu()
 * to every CPU.  Returns 0 if every CPU succeeded, otherwise the OR of
 * the per-CPU error codes.
 */
int ksm_subvert(struct ksm *k)
{
    CALL_DPC(__call_init, k);   /* start virtualization on each CPU */
    return DPC_RET();
}
/*
 * Virtualizes the current CPU.
 *
 * Checks/configures the IA32_FEATURE_CONTROL MSR so VMXON is allowed,
 * builds the vcpu state (shadow IDT, EPT, ...), then enters VMX root
 * operation via __vmx_vminit().
 *
 * Returns 0 on success (or if this CPU is already virtualized),
 * ERR_BUSY/ERR_DENIED if the feature-control MSR forbids VMXON, or a
 * negative error from vcpu_init()/__vmx_vminit().
 */
int __ksm_init_cpu(struct ksm *k)
{
    struct vcpu *vcpu;
    int ret = ERR_NOMEM;
    u64 feat_ctl;
    u64 required_feat_bits = FEATURE_CONTROL_LOCKED |
        FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX;

    vcpu = ksm_cpu(k);
    if (vcpu->subverted) {
        /* This CPU has already been subverted; nothing to do.  */
        KSM_DEBUG_RAW("CPU already subverted\n");
        return 0;
    }

#ifdef __linux__
    if (tboot_enabled())
        required_feat_bits |= FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX;
#endif

    /*
     * Make sure the feature-control MSR permits VMXON.  If it is not
     * yet locked we can set the required bits ourselves; if it is
     * locked without them, VMX was disabled (likely by firmware).
     */
    feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL);
    if ((feat_ctl & required_feat_bits) != required_feat_bits) {
        if (feat_ctl & FEATURE_CONTROL_LOCKED)
            return ERR_BUSY;

        __writemsr(MSR_IA32_FEATURE_CONTROL, feat_ctl | required_feat_bits);
        feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL);
        if ((feat_ctl & required_feat_bits) != required_feat_bits)
            return ERR_DENIED;
    }

    /* Build the vcpu data (shadow IDT, EPT tables, CR masks, ...).  */
    ret = vcpu_init(vcpu);
    if (ret < 0) {
        KSM_DEBUG_RAW("failed to create vcpu, oom?\n");
        return ret;
    }

    /* Saves state and calls vcpu_run() (defined in assembly, vmx.{S,asm}).  */
    ret = __vmx_vminit(vcpu);
    KSM_DEBUG("%s: Started: %d\n", proc_name(), !ret);

    if (ret < 0)
        goto out;

    vcpu->subverted = true;
    k->active_vcpus++;  /* one more virtualized CPU in the global state */
    return 0;

out:
    vcpu_free(vcpu);                        /* release the vcpu memory */
    __writecr4(__readcr4() & ~X86_CR4_VMXE);/* restore CR4.VMXE */
    return ret;
}
以上就是实际对 Vcpu 进行侵染（虚拟化）的入口流程。

下面来看int vcpu_init(struct vcpu *vcpu)这个函数  对 vcpu数据结构进行填充 例如ksm实现的shadow idt ept switch NESTED_VMX cr0_read_shadow等等
/*
 * Fill in the per-CPU vcpu structure before entering VMX operation:
 * the shadow IDT page, the EPT table set, nested-VMX feature control,
 * and the CR0/CR4 guest/host masks.
 *
 * Returns 0 on success, ERR_NOMEM if a page allocation failed.
 */
int vcpu_init(struct vcpu *vcpu)
{
    /*
     * This is gonna hold the shadow IDT, which they won't see, but it's
     * the one that they'll be using.
     */
    vcpu->idt.limit = PAGE_SIZE - 1;
    vcpu->idt.base = (uintptr_t)mm_alloc_page();// page backing the shadow IDT (holds a copy of the original IDT)
    if (!vcpu->idt.base)
        return ERR_NOMEM;

    if (!init_ept(&vcpu->ept)) { // allocate the initial set of EPT tables
        mm_free_page((void *)vcpu->idt.base);
        return ERR_NOMEM;
    }

#ifdef NESTED_VMX
    /* Expose feature control to the nested guest with the lock bit clear.  */
    vcpu->nested_vcpu.feat_ctl = __readmsr(MSR_IA32_FEATURE_CONTROL) & ~FEATURE_CONTROL_LOCKED;
#endif

    /*
     * Leave cr0 guest host mask empty, we support all.
     * Set VMXE bit in cr4 guest host mask so they VM-exit to us when
     * they try to set that bit.
     *
     * Note: These bits are also removed from CRx_READ_SHADOW fields, if
     * you want to opt-in a VM exit without having to remove that bit
     * completely from their CR0, then you'd probably want to make
     * a different variable, e.g. `cr0_read_shadow = X86_CR0_PE` and OR it
     * in CR0_GUEST_HOST_MASK, without masking it in CR0_READ_SHADOW...
     */
    vcpu->cr0_guest_host_mask = 0;
    vcpu->cr4_guest_host_mask = X86_CR4_VMXE;

    /*
     * NOTE(review): stores the vcpu pointer in the last 8 bytes of the
     * host stack — presumably so the VM-exit stub can recover it from
     * the stack top; confirm against vmx.{S,asm}.
     */
    *(struct vcpu **)((uintptr_t)vcpu->stack + KERNEL_STACK_SIZE - 8) = vcpu;
    return 0;
}
下面是初始化 ept 的代码。这里和网上常见代码不同的是：KSM 会一次性初始化多张 EPT 表（见 EPTP_INIT_USED）。
static inline bool init_ept(struct ept *ept)
{
    int i;
    u16 dontcare;

    for (i = 0; i < EPTP_INIT_USED; ++i) { EPTP_INIT_USED使用了几张表 比如读的时候要切换的表 写的时候要切换的表  原来没有修改过的表
        if (!ept_create_ptr(ept, EPT_ACCESS_ALL, &dontcare)) {//创建ept table ptr
            free_pml4_list(ept);
            return false;
        }
    }

    return true;
}
/*
 * Allocate a new EPT PML4 table, populate it with identity mappings
 * (see setup_pml4()), and publish its EPT pointer in the EPTP list.
 *
 * On success *out receives the index of the new entry in the EPTP list
 * and true is returned; false on exhaustion or allocation failure.
 */
bool ept_create_ptr(struct ept *ept, int access, u16 *out)
{
    u64 **pml4;
    u16 eptp;

    /*
     * Find a free slot in the EPTP list (KSM supports up to 512 EPT
     * tables by default).
     * NOTE(review): sizeof() yields a size in bytes while
     * find_first_zero_bit() conventionally takes a size in bits —
     * confirm this matches the project's implementation.
     */
    eptp = (u16)find_first_zero_bit(ept->ptr_bitmap, sizeof(ept->ptr_bitmap));
    if (eptp == sizeof(ept->ptr_bitmap)) // no free slot: maximum reached
        return false;

    pml4 = &EPT4(ept, eptp);// pointer to the PML4 slot in the EPT list
    if (!(*pml4 = mm_alloc_page()))// allocate the PML4 page itself
        return false;

    if (!setup_pml4(ept, access, eptp)) {// install the identity mappings
        __mm_free_page(*pml4);
        return false;
    }

    /* Publish: build the EPT pointer and mark the slot as used.  */
    EPTP(ept, eptp) = mkeptp(__pa(*pml4));
    set_bit(eptp, ept->ptr_bitmap);
    *out = eptp;
    return true;
}
/*
 * Identity-map all cached physical memory ranges (plus the APIC page)
 * into the EPT table selected by `eptp`, page by page, with the given
 * access rights.  Returns false on allocation failure.
 */
static bool setup_pml4(struct ept *ept, int access, u16 eptp)
{
    /*
     * On Linux, this doesn't have to be done, and we can get each
     * one as a violation, on Windows, the kernel screams and hangs.
     *
     * See mm_cache_ram_ranges() in mm.c for how this is obtained.
     */
    int i;
    u64 addr;
    u64 apic;
    struct pmem_range *range;

    for (i = 0; i < ksm->range_count; ++i) {
        range = &ksm->ranges[i];
        for (addr = range->start; addr < range->end; addr += PAGE_SIZE) {// walk the physical range a page at a time
            int r = access;
            if (access != EPT_ACCESS_ALL && mm_is_kernel_addr(__va(addr)))// kernel pages always get full access
                r = EPT_ACCESS_ALL;

            if (!ept_alloc_page(EPT4(ept, eptp), r, addr, addr))// identity map: gpa == hpa
                return false;
        }
    }

    /* Allocate APIC page  */
    apic = __readmsr(MSR_IA32_APICBASE) & MSR_IA32_APICBASE_BASE;
    if (!ept_alloc_page(EPT4(ept, eptp), EPT_ACCESS_ALL, apic, apic))
        return false;

    return true;
}

/*
 * Walk one level of the EPT paging hierarchy through `entry`,
 * allocating the next-level table (with full access rights) if it is
 * not present yet.  Returns the next-level table, or NULL on OOM.
 */
static u64 *ept_next_table(u64 *entry)
{
    u64 *table = ept_page_addr(entry);

    if (!table) {
        table = mm_alloc_page();
        if (table)
            *entry = mkepte(EPT_ACCESS_ALL, __pa(table));
    }
    return table;
}

/*
 * Map the guest physical address `gpa` to the host physical address
 * `hpa` in the given EPT PML4, creating intermediate tables as needed.
 * The final PTE gets `access` rights and write-back memory type.
 * Returns a pointer to the final PTE, or NULL on allocation failure.
 */
u64 *ept_alloc_page(u64 *pml4, int access, u64 gpa, u64 hpa)
{
    u64 *pdpt, *pdt, *pt, *pte;

    /* PML4 (512 GB) */
    pdpt = ept_next_table(&pml4[PGD_INDEX_P(gpa)]);
    if (!pdpt)
        return NULL;

    /* PDPT (1 GB)  */
    pdt = ept_next_table(&pdpt[PUD_INDEX_P(gpa)]);
    if (!pdt)
        return NULL;

    /* PDT (2 MB)  */
    pt = ept_next_table(&pdt[PMD_INDEX_P(gpa)]);
    if (!pt)
        return NULL;

    /* PT (4 KB)  */
    pte = &pt[PTE_INDEX_P(gpa)];
    *pte = mkepte(access, hpa);
    *pte |= EPT_MT_WRITEBACK << VMX_EPT_MT_EPTE_SHIFT;
    return pte;
}

/*
 * Get a PTE for the specified guest physical address, this can be used
 * to get the host physical address it redirects to or redirect to it.
 *
 * To redirect to an HPA (Host physical address):
 * @code
 *  struct ept *ept = &vcpu->ept;
 *  u64 *epte = ept_pte(EPT4(ept, EPTP_EXHOOK), gpa);
 *  __set_epte_pfn(epte, hpa >> PAGE_SHIFT);
 *  __invept_all();
 * @endcode
 *
 * Similarly, to get the HPA:
 * @code
 *  struct ept *ept = &vcpu->ept;
 *  u64 *epte = ept_pte(EPT4(ept, EPTP_EXHOOK), gpa);
 *  u64 hpa = *epte & PAGE_PA_MASK;
 *  u64 hfn = hpa >> PAGE_SHIFT;
 * @endcode
 *
 * Returns the entry that finally maps `gpa` — a 1 GB PDPTE or 2 MB
 * PDTE if a large page is in use, otherwise the 4 KB PTE — or 0 (NULL)
 * when an intermediate table is absent.
 */
u64 *ept_pte(u64 *pml4, u64 gpa)
{
    u64 *pdpt, *pdt, *pd;
    u64 *pdpte, *pdte;

    pdpt = ept_page_addr(&pml4[PGD_INDEX_P(gpa)]);
    if (!pdpt)
        return 0;

    pdpte = &pdpt[PUD_INDEX_P(gpa)];
    pdt = ept_page_addr(pdpte);
    if (!pdt)
        return 0;

    if (*pdpte & PAGE_LARGE)
        return pdpte;   /* 1 GB  */

    pdte = &pdt[PMD_INDEX_P(gpa)];
    pd = ept_page_addr(pdte);
    if (!pd)
        return 0;

    if (*pdte & PAGE_LARGE)
        return pdte;    /* 2 MB  */

    return &pd[PTE_INDEX_P(gpa)];   /* 4 KB  */
}
上面两个函数和其他的 VT 开源项目就没啥区别了：初始化页表并打上标记。
    /* Saves state and calls vcpu_run() (Defined in assembly, vmx.{S,asm} */
    ret = __vmx_vminit(vcpu);开始侵染
    KSM_DEBUG("%s: Started: %dn", proc_name(), !ret);

    if (ret < 0)
        goto out;

    vcpu->subverted = true; 侵染成功
    k->active_vcpus++;虚拟CPU数+1
    return 0;
到这里数据就初始化完了,准备填写Vmcs了
; __vmx_vminit(vcpu):
;   Saves flags and all general-purpose registers, then calls
;   vcpu_run(vcpu, sp, ip).  On a successful vm-entry, execution
;   resumes as the guest at do_resume and 0 is returned; if vcpu_run()
;   ever returns here, vm-entry failed and -1 is returned.
__vmx_vminit PROC
    pushfq
    PUSHAQ          ; save all GP registers (-8 * 16 bytes of stack)

    ; rcx contains vcpu
    mov rdx, rsp    ; SP: the guest RSP to resume on after vm-entry from host
    mov r8, do_resume   ; IP after success: guest RIP to resume at after vm-entry

    sub rsp, 20h    ; x64 calling-convention shadow space
    call    vcpu_run
    add rsp, 20h

    ; if we get here, we failed
    POPAQ
    popfq

    mov eax, -1
    ret

do_resume:
    ; Successful vm-entry lands here in guest mode: restore and return 0.
    POPAQ
    popfq

    xor eax, eax
    ret
__vmx_vminit ENDP

..........班门弄斧,KSM的版本更新比较频繁。我也很久没有写过代码了。凭借记忆写下这篇文章如果有误 还请大家指明。
0 0 vote
文章评分

由FAKE

Через тернии к звездам,
через радость и слезы
Мы проложим дорогу

Subscribe
提醒
guest
你的昵称 用于分别你是谁
你的电子邮箱 用于被回复时通知
8 评论
Inline Feedbacks
View all comments
jie
jie
游客
2017年6月3日 下午9:36

谢谢楼主分享,正在对比KVM和KSM的代码,感觉整体数据结构也不太一样,想试着在KVM中也建个多个EPT表,不过不知道要怎么实现?

jie
jie
游客
Reply to  FAKE
2017年6月4日 下午2:24

XEN中是有EPTSwitch的,叫altp2m,不过kvm我好像没有看到,不知道楼主了不了解 :redface:

jie
jie
游客
Reply to  FAKE
2017年6月5日 下午4:55

我想在kvm_vcpu_arch中,将strcut kvm_mmu mmu;替换为struct kvm_mmu *eptList; 即用多个kvm_mmu来存储多个root_hpa(ept pointer),但是在实现上好像有些问题,vcpu->arch.mmu在太多地方都用到了,可以邮件或者其他方式联系您吗?想要学习交流一下~

jie
jie
游客
Reply to  FAKE
2017年6月25日 下午10:10

QQ需要回答问题,楼主方便加我吗?654423774