/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h> /* Ugh! */
#include <asm/xcr.h>
#include <asm/pvclock.h>
#include <asm/div64.h>

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)

#define emul_to_vcpu(ctxt) \
	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)

/* EFER defaults:
 * - enable syscall by default because it is emulated by KVM
 * - enable LME and LMA by default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

static bool ignore_msrs = false;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32  kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

u64 __read_mostly host_xcr0;

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static int kvm_vcpu_reset(struct kvm_vcpu *vcpu);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

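/*
 * user_return_notifier callback: restore the host values of any shared
 * MSRs that were changed while the guest was running.
 */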
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;

	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
	locals->registered = false;
	user_return_notifier_unregister(urn);
}

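/* Cache the host's current value of @msr in this CPU's shared-MSR state. */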
static void shared_msr_update(unsigned slot, u32 msr)
{
	u64 value;
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	/* only read; nobody should modify it at this time,
	 * so we don't need a lock */
	if (slot >= shared_msrs_global.nr) {
		printk(KERN_ERR "kvm: invalid MSR slot!");
		return;
	}
	rdmsrl_safe(msr, &value);
	smsr->values[slot].host = value;
	smsr->values[slot].curr = value;
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
	shared_msrs_global.msrs[slot] = msr;
	/* make sure shared_msrs_global has been updated before it is read */
	smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
		return;
	smsr->values[slot].curr = value;
	wrmsrl(shared_msrs_global.msrs[slot], value);
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

static void drop_user_return_notifiers(void *ignore)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: check reserved bits */
	kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	if (!vcpu->arch.exception.pending) {
	queue:
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.reinject = reinject;
		return;
	}

	/* an exception is already pending; check how the two combine */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/* generate double fault per SDM Table 5-5 */
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = true;
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
	} else
		/* replace previous exception with a new one in the hope
		   that instruction re-execution will regenerate the lost
		   exception */
		goto queue;
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		kvm_x86_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;
	vcpu->arch.cr2 = fault->address;
	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
	else
		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * This function is used to read from the physical memory of the currently
 * running guest. The difference from kvm_read_guest_page is that this function
 * can read from guest physical or from the guest's guest physical memory.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa     = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				      offset * sizeof(u64), sizeof(pdpte),
				      PFERR_USER_MASK|PFERR_WRITE_MASK);
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

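/*
 * Check whether the guest has changed its PDPTEs since they were last
 * cached in walk_mmu->pdptrs.
 */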
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	bool changed = true;
	int offset;
	gfn_t gfn;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
				    X86_CR0_CD | X86_CR0_NW;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 kvm_read_cr3(vcpu)))
			return 1;
	}

	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return 1;

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
	}

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

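/*
 * Validate and install a new XCR0 value: only XCR0 itself is supported,
 * the caller must be at CPL 0, x87 state must remain enabled, YMM requires
 * SSE, and bits the host does not support are rejected.
 */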
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	xcr0 = xcr;
	if (kvm_x86_ops->get_cpl(vcpu) != 0)
		return 1;
	if (!(xcr0 & XSTATE_FP))
		return 1;
	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
		return 1;
	if (xcr0 & ~host_xcr0)
		return 1;
	vcpu->arch.xcr0 = xcr0;
	vcpu->guest_xcr0_loaded = 0;
	return 0;
}

int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	if (__kvm_set_xcr(vcpu, index, xcr)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);

int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
				   X86_CR4_PAE | X86_CR4_SMEP;
	if (cr4 & CR4_RESERVED_BITS)
		return 1;

	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
		return 1;

	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				   kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has_pcid(vcpu))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	if (kvm_x86_ops->set_cr4(vcpu, cr4))
		return 1;

	if (((cr4 ^ old_cr4) & pdptr_bits) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
		kvm_update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
				return 1;
		} else
			if (cr3 & CR3_L_MODE_RESERVED_BITS)
				return 1;
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS)
				return 1;
			if (is_paging(vcpu) &&
			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
				return 1;
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		return 1;
	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	vcpu->arch.mmu.new_cr3(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

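/*
 * Load the effective DR7 (the debugger-supplied value while hardware
 * breakpoints are in use, otherwise the guest's value) and recompute
 * whether debug registers need to be switched on guest entry.
 */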
static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	kvm_x86_ops->set_dr7(vcpu, dr7);
	vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
}

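/* Returns 0 on success, 1 if a #UD should be injected, -1 for a #GP. */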
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[dr] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	default: /* 7 */
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	int res;

	res = __kvm_set_dr(vcpu, dr, val);
	if (res > 0)
		kvm_queue_exception(vcpu, UD_VECTOR);
	else if (res < 0)
		kvm_inject_gp(vcpu, 0);

	return res;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[dr];
		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	case 6:
		*val = vcpu->arch.dr6;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}

	return 0;
}

int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	if (_kvm_get_dr(vcpu, dr, val)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

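/*
 * Emulate RDPMC: read the performance counter selected by ECX through the
 * virtual PMU and return the result in EDX:EAX.
 */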
bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	u64 data;
	int err;

	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
	if (err)
		return err;
	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
	return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put at the beginning of the list.
 */

#define KVM_SAVE_MSRS_BEGIN	10
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static const u32 emulated_msrs[] = {
	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSCDEADLINE,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
};

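/*
 * Validate a guest EFER write: reject reserved bits, LME changes while
 * paging is enabled, and FFXSR/SVME when the guest's CPUID does not
 * advertise them.
 */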
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	u64 old_efer = vcpu->arch.efer;

	if (efer & efer_reserved_bits)
		return 1;

	if (is_paging(vcpu)
	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
		return 1;

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
			return 1;
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
			return 1;
	}

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.efer & EFER_LMA;

	kvm_x86_ops->set_efer(vcpu, efer);

	vcpu->arch.mmu.base_role.nxe = (efer & EFER_NX) && !tdp_enabled;

	/* Update reserved bits */
	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}

void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	return kvm_x86_ops->set_msr(vcpu, msr);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	struct msr_data msr;

	msr.data = *data;
	msr.index = index;
	msr.host_initiated = true;
	return kvm_set_msr(vcpu, &msr);
}

#ifdef CONFIG_X86_64
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int vclock_mode;
		cycle_t	cycle_last;
		cycle_t	mask;
		u32	mult;
		u32	shift;
	} clock;

	/* open coded 'struct timespec' */
	u64		monotonic_time_snsec;
	time_t		monotonic_time_sec;
};

static struct pvclock_gtod_data pvclock_gtod_data;

static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->clock->cycle_last;
	vdata->clock.mask		= tk->clock->mask;
	vdata->clock.mult		= tk->mult;
	vdata->clock.shift		= tk->shift;

	vdata->monotonic_time_sec	= tk->xtime_sec
					+ tk->wall_to_monotonic.tv_sec;
	vdata->monotonic_time_snsec	= tk->xtime_nsec
					+ (tk->wall_to_monotonic.tv_nsec
						<< tk->shift);
	while (vdata->monotonic_time_snsec >=
					(((u64)NSEC_PER_SEC) << tk->shift)) {
		vdata->monotonic_time_snsec -=
					((u64)NSEC_PER_SEC) << tk->shift;
		vdata->monotonic_time_sec++;
	}

	write_seqcount_end(&vdata->seq);
}
#endif


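/*
 * Publish the pvclock wall-clock structure for the guest: bump the version
 * (kept odd while the update is in flight) and write the host boot time,
 * adjusted by the per-VM kvmclock offset.
 */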
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	int version;
	int r;
	struct pvclock_wall_clock wc;
	struct timespec boot;

	if (!wall_clock)
		return;

	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
	if (r)
		return;

	if (version & 1)
		++version;  /* first time write, random junk */

	++version;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_guest_time_update below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

	if (kvm->arch.kvmclock_offset) {
		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
		boot = timespec_sub(boot, ts);
	}
	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;
	wc.version = version;

	kvm_write_guest(kvm, wall_clock, &wc, sizeof(wc));