/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * derived from drivers/kvm/kvm_main.c
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright (C) 2008 Qumranet, Inc.
 * Copyright IBM Corporation, 2008
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *   Amit Shah    <amit.shah@qumranet.com>
 *   Ben-Ami Yassour <benami@il.ibm.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <linux/kvm_host.h>
#include "irq.h"
#include "mmu.h"
#include "i8254.h"
#include "tss.h"
#include "kvm_cache_regs.h"
#include "x86.h"
#include "cpuid.h"

#include <linux/clocksource.h>
#include <linux/interrupt.h>
#include <linux/kvm.h>
#include <linux/fs.h>
#include <linux/vmalloc.h>
#include <linux/module.h>
#include <linux/mman.h>
#include <linux/highmem.h>
#include <linux/iommu.h>
#include <linux/intel-iommu.h>
#include <linux/cpufreq.h>
#include <linux/user-return-notifier.h>
#include <linux/srcu.h>
#include <linux/slab.h>
#include <linux/perf_event.h>
#include <linux/uaccess.h>
#include <linux/hash.h>
#include <linux/pci.h>
#include <linux/timekeeper_internal.h>
#include <linux/pvclock_gtod.h>
#include <trace/events/kvm.h>

#define CREATE_TRACE_POINTS
#include "trace.h"

#include <asm/debugreg.h>
#include <asm/msr.h>
#include <asm/desc.h>
#include <asm/mtrr.h>
#include <asm/mce.h>
#include <asm/i387.h>
#include <asm/fpu-internal.h> /* Ugh! */
#include <asm/xcr.h>
#include <asm/pvclock.h>
#include <asm/div64.h>

#define MAX_IO_MSRS 256
#define KVM_MAX_MCE_BANKS 32
#define KVM_MCE_CAP_SUPPORTED (MCG_CTL_P | MCG_SER_P)

#define emul_to_vcpu(ctxt) \
	container_of(ctxt, struct kvm_vcpu, arch.emulate_ctxt)

/* EFER defaults:
 * - enable syscall per default because it is emulated by KVM
 * - enable LME and LMA per default on 64 bit KVM
 */
#ifdef CONFIG_X86_64
static
u64 __read_mostly efer_reserved_bits = ~((u64)(EFER_SCE | EFER_LME | EFER_LMA));
#else
static u64 __read_mostly efer_reserved_bits = ~((u64)EFER_SCE);
#endif

#define VM_STAT(x) offsetof(struct kvm, stat.x), KVM_STAT_VM
#define VCPU_STAT(x) offsetof(struct kvm_vcpu, stat.x), KVM_STAT_VCPU

static void update_cr8_intercept(struct kvm_vcpu *vcpu);
static void process_nmi(struct kvm_vcpu *vcpu);

struct kvm_x86_ops *kvm_x86_ops;
EXPORT_SYMBOL_GPL(kvm_x86_ops);

static bool ignore_msrs = 0;
module_param(ignore_msrs, bool, S_IRUGO | S_IWUSR);

bool kvm_has_tsc_control;
EXPORT_SYMBOL_GPL(kvm_has_tsc_control);
u32  kvm_max_guest_tsc_khz;
EXPORT_SYMBOL_GPL(kvm_max_guest_tsc_khz);

/* tsc tolerance in parts per million - default to 1/2 of the NTP threshold */
static u32 tsc_tolerance_ppm = 250;
module_param(tsc_tolerance_ppm, uint, S_IRUGO | S_IWUSR);

#define KVM_NR_SHARED_MSRS 16

struct kvm_shared_msrs_global {
	int nr;
	u32 msrs[KVM_NR_SHARED_MSRS];
};

struct kvm_shared_msrs {
	struct user_return_notifier urn;
	bool registered;
	struct kvm_shared_msr_values {
		u64 host;
		u64 curr;
	} values[KVM_NR_SHARED_MSRS];
};

static struct kvm_shared_msrs_global __read_mostly shared_msrs_global;
static struct kvm_shared_msrs __percpu *shared_msrs;

struct kvm_stats_debugfs_item debugfs_entries[] = {
	{ "pf_fixed", VCPU_STAT(pf_fixed) },
	{ "pf_guest", VCPU_STAT(pf_guest) },
	{ "tlb_flush", VCPU_STAT(tlb_flush) },
	{ "invlpg", VCPU_STAT(invlpg) },
	{ "exits", VCPU_STAT(exits) },
	{ "io_exits", VCPU_STAT(io_exits) },
	{ "mmio_exits", VCPU_STAT(mmio_exits) },
	{ "signal_exits", VCPU_STAT(signal_exits) },
	{ "irq_window", VCPU_STAT(irq_window_exits) },
	{ "nmi_window", VCPU_STAT(nmi_window_exits) },
	{ "halt_exits", VCPU_STAT(halt_exits) },
	{ "halt_wakeup", VCPU_STAT(halt_wakeup) },
	{ "hypercalls", VCPU_STAT(hypercalls) },
	{ "request_irq", VCPU_STAT(request_irq_exits) },
	{ "irq_exits", VCPU_STAT(irq_exits) },
	{ "host_state_reload", VCPU_STAT(host_state_reload) },
	{ "efer_reload", VCPU_STAT(efer_reload) },
	{ "fpu_reload", VCPU_STAT(fpu_reload) },
	{ "insn_emulation", VCPU_STAT(insn_emulation) },
	{ "insn_emulation_fail", VCPU_STAT(insn_emulation_fail) },
	{ "irq_injections", VCPU_STAT(irq_injections) },
	{ "nmi_injections", VCPU_STAT(nmi_injections) },
	{ "mmu_shadow_zapped", VM_STAT(mmu_shadow_zapped) },
	{ "mmu_pte_write", VM_STAT(mmu_pte_write) },
	{ "mmu_pte_updated", VM_STAT(mmu_pte_updated) },
	{ "mmu_pde_zapped", VM_STAT(mmu_pde_zapped) },
	{ "mmu_flooded", VM_STAT(mmu_flooded) },
	{ "mmu_recycled", VM_STAT(mmu_recycled) },
	{ "mmu_cache_miss", VM_STAT(mmu_cache_miss) },
	{ "mmu_unsync", VM_STAT(mmu_unsync) },
	{ "remote_tlb_flush", VM_STAT(remote_tlb_flush) },
	{ "largepages", VM_STAT(lpages) },
	{ NULL }
};

u64 __read_mostly host_xcr0;

static int emulator_fix_hypercall(struct x86_emulate_ctxt *ctxt);

static inline void kvm_async_pf_hash_reset(struct kvm_vcpu *vcpu)
{
	int i;
	for (i = 0; i < roundup_pow_of_two(ASYNC_PF_PER_VCPU); i++)
		vcpu->arch.apf.gfns[i] = ~0;
}

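/*
 * User-return notifier callback: restore the host values of any shared MSRs
 * the guest has changed before returning to userspace, then drop this CPU's
 * notifier.
 */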
static void kvm_on_user_return(struct user_return_notifier *urn)
{
	unsigned slot;
	struct kvm_shared_msrs *locals
		= container_of(urn, struct kvm_shared_msrs, urn);
	struct kvm_shared_msr_values *values;

	for (slot = 0; slot < shared_msrs_global.nr; ++slot) {
		values = &locals->values[slot];
		if (values->host != values->curr) {
			wrmsrl(shared_msrs_global.msrs[slot], values->host);
			values->curr = values->host;
		}
	}
	locals->registered = false;
	user_return_notifier_unregister(urn);
}

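/* Snapshot the current host value of an MSR into this CPU's shared-MSR cache. */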
static void shared_msr_update(unsigned slot, u32 msr)
{
	u64 value;
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	/* only read, and nobody should modify it at this time,
	 * so we don't need a lock */
	if (slot >= shared_msrs_global.nr) {
		printk(KERN_ERR "kvm: invalid MSR slot!");
		return;
	}
	rdmsrl_safe(msr, &value);
	smsr->values[slot].host = value;
	smsr->values[slot].curr = value;
}

void kvm_define_shared_msr(unsigned slot, u32 msr)
{
	if (slot >= shared_msrs_global.nr)
		shared_msrs_global.nr = slot + 1;
	shared_msrs_global.msrs[slot] = msr;
	/* we need to ensure that shared_msrs_global has been updated */
	smp_wmb();
}
EXPORT_SYMBOL_GPL(kvm_define_shared_msr);

static void kvm_shared_msr_cpu_online(void)
{
	unsigned i;

	for (i = 0; i < shared_msrs_global.nr; ++i)
		shared_msr_update(i, shared_msrs_global.msrs[i]);
}

void kvm_set_shared_msr(unsigned slot, u64 value, u64 mask)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (((value ^ smsr->values[slot].curr) & mask) == 0)
		return;
	smsr->values[slot].curr = value;
	wrmsrl(shared_msrs_global.msrs[slot], value);
	if (!smsr->registered) {
		smsr->urn.on_user_return = kvm_on_user_return;
		user_return_notifier_register(&smsr->urn);
		smsr->registered = true;
	}
}
EXPORT_SYMBOL_GPL(kvm_set_shared_msr);

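/*
 * Restore host MSR values on this CPU right away, rather than waiting for
 * the next return to userspace, if a user-return notifier is registered.
 */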
static void drop_user_return_notifiers(void *ignore)
{
	unsigned int cpu = smp_processor_id();
	struct kvm_shared_msrs *smsr = per_cpu_ptr(shared_msrs, cpu);

	if (smsr->registered)
		kvm_on_user_return(&smsr->urn);
}

u64 kvm_get_apic_base(struct kvm_vcpu *vcpu)
{
	return vcpu->arch.apic_base;
}
EXPORT_SYMBOL_GPL(kvm_get_apic_base);

void kvm_set_apic_base(struct kvm_vcpu *vcpu, u64 data)
{
	/* TODO: reserve bits check */
	kvm_lapic_set_base(vcpu, data);
}
EXPORT_SYMBOL_GPL(kvm_set_apic_base);

asmlinkage void kvm_spurious_fault(void)
{
	/* Fault while not rebooting.  We want the trace. */
	BUG();
}
EXPORT_SYMBOL_GPL(kvm_spurious_fault);

#define EXCPT_BENIGN		0
#define EXCPT_CONTRIBUTORY	1
#define EXCPT_PF		2

static int exception_class(int vector)
{
	switch (vector) {
	case PF_VECTOR:
		return EXCPT_PF;
	case DE_VECTOR:
	case TS_VECTOR:
	case NP_VECTOR:
	case SS_VECTOR:
	case GP_VECTOR:
		return EXCPT_CONTRIBUTORY;
	default:
		break;
	}
	return EXCPT_BENIGN;
}

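/*
 * Queue an exception for injection into the guest.  If another exception is
 * already pending, the two are merged according to the SDM: two contributory
 * exceptions (or a fault following a pending #PF) become #DF, and anything
 * on top of a pending #DF results in a triple fault / shutdown.
 */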
static void kvm_multiple_exception(struct kvm_vcpu *vcpu,
		unsigned nr, bool has_error, u32 error_code,
		bool reinject)
{
	u32 prev_nr;
	int class1, class2;

	kvm_make_request(KVM_REQ_EVENT, vcpu);

	if (!vcpu->arch.exception.pending) {
	queue:
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = has_error;
		vcpu->arch.exception.nr = nr;
		vcpu->arch.exception.error_code = error_code;
		vcpu->arch.exception.reinject = reinject;
		return;
	}

	/* check the previously queued exception */
	prev_nr = vcpu->arch.exception.nr;
	if (prev_nr == DF_VECTOR) {
		/* triple fault -> shutdown */
		kvm_make_request(KVM_REQ_TRIPLE_FAULT, vcpu);
		return;
	}
	class1 = exception_class(prev_nr);
	class2 = exception_class(nr);
	if ((class1 == EXCPT_CONTRIBUTORY && class2 == EXCPT_CONTRIBUTORY)
		|| (class1 == EXCPT_PF && class2 != EXCPT_BENIGN)) {
		/* generate double fault per SDM Table 5-5 */
		vcpu->arch.exception.pending = true;
		vcpu->arch.exception.has_error_code = true;
		vcpu->arch.exception.nr = DF_VECTOR;
		vcpu->arch.exception.error_code = 0;
	} else
		/* replace previous exception with a new one in the hope
		   that instruction re-execution will regenerate the lost
		   exception */
		goto queue;
}

void kvm_queue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception);

void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr)
{
	kvm_multiple_exception(vcpu, nr, false, 0, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception);

void kvm_complete_insn_gp(struct kvm_vcpu *vcpu, int err)
{
	if (err)
		kvm_inject_gp(vcpu, 0);
	else
		kvm_x86_ops->skip_emulated_instruction(vcpu);
}
EXPORT_SYMBOL_GPL(kvm_complete_insn_gp);

void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	++vcpu->stat.pf_guest;
	vcpu->arch.cr2 = fault->address;
	kvm_queue_exception_e(vcpu, PF_VECTOR, fault->error_code);
}
EXPORT_SYMBOL_GPL(kvm_inject_page_fault);

void kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault)
{
	if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
		vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
	else
		vcpu->arch.mmu.inject_page_fault(vcpu, fault);
}

void kvm_inject_nmi(struct kvm_vcpu *vcpu)
{
	atomic_inc(&vcpu->arch.nmi_queued);
	kvm_make_request(KVM_REQ_NMI, vcpu);
}
EXPORT_SYMBOL_GPL(kvm_inject_nmi);

void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, false);
}
EXPORT_SYMBOL_GPL(kvm_queue_exception_e);

void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code)
{
	kvm_multiple_exception(vcpu, nr, true, error_code, true);
}
EXPORT_SYMBOL_GPL(kvm_requeue_exception_e);

/*
 * Checks if cpl <= required_cpl; if true, return true.  Otherwise queue
 * a #GP and return false.
 */
bool kvm_require_cpl(struct kvm_vcpu *vcpu, int required_cpl)
{
	if (kvm_x86_ops->get_cpl(vcpu) <= required_cpl)
		return true;
	kvm_queue_exception_e(vcpu, GP_VECTOR, 0);
	return false;
}
EXPORT_SYMBOL_GPL(kvm_require_cpl);

/*
 * This function will be used to read from the physical memory of the currently
 * running guest. The difference from kvm_read_guest_page is that this function
 * can read from guest physical or from the guest's guest physical memory.
 */
int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
			    gfn_t ngfn, void *data, int offset, int len,
			    u32 access)
{
	gfn_t real_gfn;
	gpa_t ngpa;

	ngpa     = gfn_to_gpa(ngfn);
	real_gfn = mmu->translate_gpa(vcpu, ngpa, access);
	if (real_gfn == UNMAPPED_GVA)
		return -EFAULT;

	real_gfn = gpa_to_gfn(real_gfn);

	return kvm_read_guest_page(vcpu->kvm, real_gfn, data, offset, len);
}
EXPORT_SYMBOL_GPL(kvm_read_guest_page_mmu);

int kvm_read_nested_guest_page(struct kvm_vcpu *vcpu, gfn_t gfn,
			       void *data, int offset, int len, u32 access)
{
	return kvm_read_guest_page_mmu(vcpu, vcpu->arch.walk_mmu, gfn,
				       data, offset, len, access);
}

/*
 * Load the pae pdptrs.  Return true if they are all valid.
 */
int load_pdptrs(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, unsigned long cr3)
{
	gfn_t pdpt_gfn = cr3 >> PAGE_SHIFT;
	unsigned offset = ((cr3 & (PAGE_SIZE-1)) >> 5) << 2;
	int i;
	int ret;
	u64 pdpte[ARRAY_SIZE(mmu->pdptrs)];

	ret = kvm_read_guest_page_mmu(vcpu, mmu, pdpt_gfn, pdpte,
				      offset * sizeof(u64), sizeof(pdpte),
				      PFERR_USER_MASK|PFERR_WRITE_MASK);
	if (ret < 0) {
		ret = 0;
		goto out;
	}
	for (i = 0; i < ARRAY_SIZE(pdpte); ++i) {
		if (is_present_gpte(pdpte[i]) &&
		    (pdpte[i] & vcpu->arch.mmu.rsvd_bits_mask[0][2])) {
			ret = 0;
			goto out;
		}
	}
	ret = 1;

	memcpy(mmu->pdptrs, pdpte, sizeof(mmu->pdptrs));
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_avail);
	__set_bit(VCPU_EXREG_PDPTR,
		  (unsigned long *)&vcpu->arch.regs_dirty);
out:

	return ret;
}
EXPORT_SYMBOL_GPL(load_pdptrs);

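/*
 * Returns true if the cached PAE PDPTEs cannot be trusted, i.e. they are not
 * available or may differ from the ones currently in guest memory.
 */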
static bool pdptrs_changed(struct kvm_vcpu *vcpu)
{
	u64 pdpte[ARRAY_SIZE(vcpu->arch.walk_mmu->pdptrs)];
	bool changed = true;
	int offset;
	gfn_t gfn;
	int r;

	if (is_long_mode(vcpu) || !is_pae(vcpu))
		return false;

	if (!test_bit(VCPU_EXREG_PDPTR,
		      (unsigned long *)&vcpu->arch.regs_avail))
		return true;

	gfn = (kvm_read_cr3(vcpu) & ~31u) >> PAGE_SHIFT;
	offset = (kvm_read_cr3(vcpu) & ~31u) & (PAGE_SIZE - 1);
	r = kvm_read_nested_guest_page(vcpu, gfn, pdpte, offset, sizeof(pdpte),
				       PFERR_USER_MASK | PFERR_WRITE_MASK);
	if (r < 0)
		goto out;
	changed = memcmp(pdpte, vcpu->arch.walk_mmu->pdptrs, sizeof(pdpte)) != 0;
out:

	return changed;
}

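/*
 * Emulate a guest write to CR0: validate reserved bits and mode-transition
 * rules, reload the PAE PDPTEs when needed, and reset the MMU context when
 * paging-related bits change.  Returns 0 on success, non-zero on failure.
 */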
int kvm_set_cr0(struct kvm_vcpu *vcpu, unsigned long cr0)
{
	unsigned long old_cr0 = kvm_read_cr0(vcpu);
	unsigned long update_bits = X86_CR0_PG | X86_CR0_WP |
				    X86_CR0_CD | X86_CR0_NW;

	cr0 |= X86_CR0_ET;

#ifdef CONFIG_X86_64
	if (cr0 & 0xffffffff00000000UL)
		return 1;
#endif

	cr0 &= ~CR0_RESERVED_BITS;

	if ((cr0 & X86_CR0_NW) && !(cr0 & X86_CR0_CD))
		return 1;

	if ((cr0 & X86_CR0_PG) && !(cr0 & X86_CR0_PE))
		return 1;

	if (!is_paging(vcpu) && (cr0 & X86_CR0_PG)) {
#ifdef CONFIG_X86_64
		if ((vcpu->arch.efer & EFER_LME)) {
			int cs_db, cs_l;

			if (!is_pae(vcpu))
				return 1;
			kvm_x86_ops->get_cs_db_l_bits(vcpu, &cs_db, &cs_l);
			if (cs_l)
				return 1;
		} else
#endif
		if (is_pae(vcpu) && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
						 kvm_read_cr3(vcpu)))
			return 1;
	}

	if (!(cr0 & X86_CR0_PG) && kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE))
		return 1;

	kvm_x86_ops->set_cr0(vcpu, cr0);

	if ((cr0 ^ old_cr0) & X86_CR0_PG) {
		kvm_clear_async_pf_completion_queue(vcpu);
		kvm_async_pf_hash_reset(vcpu);
	}

	if ((cr0 ^ old_cr0) & update_bits)
		kvm_mmu_reset_context(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr0);

void kvm_lmsw(struct kvm_vcpu *vcpu, unsigned long msw)
{
	(void)kvm_set_cr0(vcpu, kvm_read_cr0_bits(vcpu, ~0x0eul) | (msw & 0x0f));
}
EXPORT_SYMBOL_GPL(kvm_lmsw);

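/*
 * XSETBV emulation: validate and load a guest-supplied XCR0 value.
 */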
int __kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	u64 xcr0;

	/* Only support XCR_XFEATURE_ENABLED_MASK(xcr0) now  */
	if (index != XCR_XFEATURE_ENABLED_MASK)
		return 1;
	xcr0 = xcr;
	if (kvm_x86_ops->get_cpl(vcpu) != 0)
		return 1;
	if (!(xcr0 & XSTATE_FP))
		return 1;
	if ((xcr0 & XSTATE_YMM) && !(xcr0 & XSTATE_SSE))
		return 1;
	if (xcr0 & ~host_xcr0)
		return 1;
	vcpu->arch.xcr0 = xcr0;
	vcpu->guest_xcr0_loaded = 0;
	return 0;
}

int kvm_set_xcr(struct kvm_vcpu *vcpu, u32 index, u64 xcr)
{
	if (__kvm_set_xcr(vcpu, index, xcr)) {
		kvm_inject_gp(vcpu, 0);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_xcr);

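/*
 * Emulate a guest write to CR4, checking reserved bits and bits the guest's
 * CPUID does not allow; reload the PDPTEs and reset the MMU context when
 * paging-affecting bits change.
 */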
int kvm_set_cr4(struct kvm_vcpu *vcpu, unsigned long cr4)
{
	unsigned long old_cr4 = kvm_read_cr4(vcpu);
	unsigned long pdptr_bits = X86_CR4_PGE | X86_CR4_PSE |
				   X86_CR4_PAE | X86_CR4_SMEP;
	if (cr4 & CR4_RESERVED_BITS)
		return 1;

	if (!guest_cpuid_has_xsave(vcpu) && (cr4 & X86_CR4_OSXSAVE))
		return 1;

	if (!guest_cpuid_has_smep(vcpu) && (cr4 & X86_CR4_SMEP))
		return 1;

	if (!guest_cpuid_has_fsgsbase(vcpu) && (cr4 & X86_CR4_RDWRGSFS))
		return 1;

	if (is_long_mode(vcpu)) {
		if (!(cr4 & X86_CR4_PAE))
			return 1;
	} else if (is_paging(vcpu) && (cr4 & X86_CR4_PAE)
		   && ((cr4 ^ old_cr4) & pdptr_bits)
		   && !load_pdptrs(vcpu, vcpu->arch.walk_mmu,
				   kvm_read_cr3(vcpu)))
		return 1;

	if ((cr4 & X86_CR4_PCIDE) && !(old_cr4 & X86_CR4_PCIDE)) {
		if (!guest_cpuid_has_pcid(vcpu))
			return 1;

		/* PCID can not be enabled when cr3[11:0]!=000H or EFER.LMA=0 */
		if ((kvm_read_cr3(vcpu) & X86_CR3_PCID_MASK) || !is_long_mode(vcpu))
			return 1;
	}

	if (kvm_x86_ops->set_cr4(vcpu, cr4))
		return 1;

	if (((cr4 ^ old_cr4) & pdptr_bits) ||
	    (!(cr4 & X86_CR4_PCIDE) && (old_cr4 & X86_CR4_PCIDE)))
		kvm_mmu_reset_context(vcpu);

	if ((cr4 ^ old_cr4) & X86_CR4_OSXSAVE)
		kvm_update_cpuid(vcpu);

	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr4);

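/*
 * Emulate a guest write to CR3: a write of the current value only syncs the
 * roots and flushes the TLB; otherwise reserved bits are checked, the PAE
 * PDPTEs are reloaded when needed, and a new root is loaded.
 */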
int kvm_set_cr3(struct kvm_vcpu *vcpu, unsigned long cr3)
{
	if (cr3 == kvm_read_cr3(vcpu) && !pdptrs_changed(vcpu)) {
		kvm_mmu_sync_roots(vcpu);
		kvm_mmu_flush_tlb(vcpu);
		return 0;
	}

	if (is_long_mode(vcpu)) {
		if (kvm_read_cr4_bits(vcpu, X86_CR4_PCIDE)) {
			if (cr3 & CR3_PCID_ENABLED_RESERVED_BITS)
				return 1;
		} else
			if (cr3 & CR3_L_MODE_RESERVED_BITS)
				return 1;
	} else {
		if (is_pae(vcpu)) {
			if (cr3 & CR3_PAE_RESERVED_BITS)
				return 1;
			if (is_paging(vcpu) &&
			    !load_pdptrs(vcpu, vcpu->arch.walk_mmu, cr3))
				return 1;
		}
		/*
		 * We don't check reserved bits in nonpae mode, because
		 * this isn't enforced, and VMware depends on this.
		 */
	}

	/*
	 * Does the new cr3 value map to physical memory? (Note, we
	 * catch an invalid cr3 even in real-mode, because it would
	 * cause trouble later on when we turn on paging anyway.)
	 *
	 * A real CPU would silently accept an invalid cr3 and would
	 * attempt to use it - with largely undefined (and often hard
	 * to debug) behavior on the guest side.
	 */
	if (unlikely(!gfn_to_memslot(vcpu->kvm, cr3 >> PAGE_SHIFT)))
		return 1;
	vcpu->arch.cr3 = cr3;
	__set_bit(VCPU_EXREG_CR3, (ulong *)&vcpu->arch.regs_avail);
	vcpu->arch.mmu.new_cr3(vcpu);
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr3);

int kvm_set_cr8(struct kvm_vcpu *vcpu, unsigned long cr8)
{
	if (cr8 & CR8_RESERVED_BITS)
		return 1;
	if (irqchip_in_kernel(vcpu->kvm))
		kvm_lapic_set_tpr(vcpu, cr8);
	else
		vcpu->arch.cr8 = cr8;
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_set_cr8);

unsigned long kvm_get_cr8(struct kvm_vcpu *vcpu)
{
	if (irqchip_in_kernel(vcpu->kvm))
		return kvm_lapic_get_cr8(vcpu);
	else
		return vcpu->arch.cr8;
}
EXPORT_SYMBOL_GPL(kvm_get_cr8);

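/*
 * Propagate the effective DR7 (the guest's value, or the debug value set by
 * userspace when hardware breakpoints are in use) to the CPU.
 */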
static void kvm_update_dr7(struct kvm_vcpu *vcpu)
{
	unsigned long dr7;

	if (vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP)
		dr7 = vcpu->arch.guest_debug_dr7;
	else
		dr7 = vcpu->arch.dr7;
	kvm_x86_ops->set_dr7(vcpu, dr7);
	vcpu->arch.switch_db_regs = (dr7 & DR7_BP_EN_MASK);
}

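/*
 * Emulate a guest write to a debug register.  Returns 0 on success, 1 when
 * the access should raise #UD (DR4/DR5 with CR4.DE set) and -1 for #GP.
 */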
static int __kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	switch (dr) {
	case 0 ... 3:
		vcpu->arch.db[dr] = val;
		if (!(vcpu->guest_debug & KVM_GUESTDBG_USE_HW_BP))
			vcpu->arch.eff_db[dr] = val;
		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	case 6:
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr6 = (val & DR6_VOLATILE) | DR6_FIXED_1;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1; /* #UD */
		/* fall through */
	default: /* 7 */
		if (val & 0xffffffff00000000ULL)
			return -1; /* #GP */
		vcpu->arch.dr7 = (val & DR7_VOLATILE) | DR7_FIXED_1;
		kvm_update_dr7(vcpu);
		break;
	}

	return 0;
}

int kvm_set_dr(struct kvm_vcpu *vcpu, int dr, unsigned long val)
{
	int res;

	res = __kvm_set_dr(vcpu, dr, val);
	if (res > 0)
		kvm_queue_exception(vcpu, UD_VECTOR);
	else if (res < 0)
		kvm_inject_gp(vcpu, 0);

	return res;
}
EXPORT_SYMBOL_GPL(kvm_set_dr);

static int _kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	switch (dr) {
	case 0 ... 3:
		*val = vcpu->arch.db[dr];
		break;
	case 4:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	case 6:
		*val = vcpu->arch.dr6;
		break;
	case 5:
		if (kvm_read_cr4_bits(vcpu, X86_CR4_DE))
			return 1;
		/* fall through */
	default: /* 7 */
		*val = vcpu->arch.dr7;
		break;
	}

	return 0;
}

int kvm_get_dr(struct kvm_vcpu *vcpu, int dr, unsigned long *val)
{
	if (_kvm_get_dr(vcpu, dr, val)) {
		kvm_queue_exception(vcpu, UD_VECTOR);
		return 1;
	}
	return 0;
}
EXPORT_SYMBOL_GPL(kvm_get_dr);

bool kvm_rdpmc(struct kvm_vcpu *vcpu)
{
	u32 ecx = kvm_register_read(vcpu, VCPU_REGS_RCX);
	u64 data;
	int err;

	err = kvm_pmu_read_pmc(vcpu, ecx, &data);
	if (err)
		return err;
	kvm_register_write(vcpu, VCPU_REGS_RAX, (u32)data);
	kvm_register_write(vcpu, VCPU_REGS_RDX, data >> 32);
	return err;
}
EXPORT_SYMBOL_GPL(kvm_rdpmc);

/*
 * List of msr numbers which we expose to userspace through KVM_GET_MSRS
 * and KVM_SET_MSRS, and KVM_GET_MSR_INDEX_LIST.
 *
 * This list is modified at module load time to reflect the
 * capabilities of the host cpu. This capabilities test skips MSRs that are
 * kvm-specific. Those are put in the beginning of the list.
 */

#define KVM_SAVE_MSRS_BEGIN	10
static u32 msrs_to_save[] = {
	MSR_KVM_SYSTEM_TIME, MSR_KVM_WALL_CLOCK,
	MSR_KVM_SYSTEM_TIME_NEW, MSR_KVM_WALL_CLOCK_NEW,
	HV_X64_MSR_GUEST_OS_ID, HV_X64_MSR_HYPERCALL,
	HV_X64_MSR_APIC_ASSIST_PAGE, MSR_KVM_ASYNC_PF_EN, MSR_KVM_STEAL_TIME,
	MSR_KVM_PV_EOI_EN,
	MSR_IA32_SYSENTER_CS, MSR_IA32_SYSENTER_ESP, MSR_IA32_SYSENTER_EIP,
	MSR_STAR,
#ifdef CONFIG_X86_64
	MSR_CSTAR, MSR_KERNEL_GS_BASE, MSR_SYSCALL_MASK, MSR_LSTAR,
#endif
	MSR_IA32_TSC, MSR_IA32_CR_PAT, MSR_VM_HSAVE_PA
};

static unsigned num_msrs_to_save;

static const u32 emulated_msrs[] = {
	MSR_IA32_TSC_ADJUST,
	MSR_IA32_TSCDEADLINE,
	MSR_IA32_MISC_ENABLE,
	MSR_IA32_MCG_STATUS,
	MSR_IA32_MCG_CTL,
};

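/*
 * Validate and apply a guest write to the EFER MSR, preserving the current
 * LMA bit and resetting the MMU context when NX changes.
 */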
static int set_efer(struct kvm_vcpu *vcpu, u64 efer)
{
	u64 old_efer = vcpu->arch.efer;

	if (efer & efer_reserved_bits)
		return 1;

	if (is_paging(vcpu)
	    && (vcpu->arch.efer & EFER_LME) != (efer & EFER_LME))
		return 1;

	if (efer & EFER_FFXSR) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->edx & bit(X86_FEATURE_FXSR_OPT)))
			return 1;
	}

	if (efer & EFER_SVME) {
		struct kvm_cpuid_entry2 *feat;

		feat = kvm_find_cpuid_entry(vcpu, 0x80000001, 0);
		if (!feat || !(feat->ecx & bit(X86_FEATURE_SVM)))
			return 1;
	}

	efer &= ~EFER_LMA;
	efer |= vcpu->arch.efer & EFER_LMA;

	kvm_x86_ops->set_efer(vcpu, efer);

	/* Update reserved bits */
	if ((efer ^ old_efer) & EFER_NX)
		kvm_mmu_reset_context(vcpu);

	return 0;
}

void kvm_enable_efer_bits(u64 mask)
{
       efer_reserved_bits &= ~mask;
}
EXPORT_SYMBOL_GPL(kvm_enable_efer_bits);


/*
 * Writes msr value into the appropriate "register".
 * Returns 0 on success, non-0 otherwise.
 * Assumes vcpu_load() was already called.
 */
int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr)
{
	return kvm_x86_ops->set_msr(vcpu, msr);
}

/*
 * Adapt set_msr() to msr_io()'s calling convention
 */
static int do_set_msr(struct kvm_vcpu *vcpu, unsigned index, u64 *data)
{
	struct msr_data msr;

	msr.data = *data;
	msr.index = index;
	msr.host_initiated = true;
	return kvm_set_msr(vcpu, &msr);
}

#ifdef CONFIG_X86_64
struct pvclock_gtod_data {
	seqcount_t	seq;

	struct { /* extract of a clocksource struct */
		int vclock_mode;
		cycle_t	cycle_last;
		cycle_t	mask;
		u32	mult;
		u32	shift;
	} clock;

	/* open coded 'struct timespec' */
	u64		monotonic_time_snsec;
	time_t		monotonic_time_sec;
};

static struct pvclock_gtod_data pvclock_gtod_data;

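/*
 * Copy the timekeeper's clocksource parameters and monotonic time into
 * pvclock_gtod_data, protected by its seqcount.
 */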
static void update_pvclock_gtod(struct timekeeper *tk)
{
	struct pvclock_gtod_data *vdata = &pvclock_gtod_data;

	write_seqcount_begin(&vdata->seq);

	/* copy pvclock gtod data */
	vdata->clock.vclock_mode	= tk->clock->archdata.vclock_mode;
	vdata->clock.cycle_last		= tk->clock->cycle_last;
	vdata->clock.mask		= tk->clock->mask;
	vdata->clock.mult		= tk->mult;
	vdata->clock.shift		= tk->shift;

	vdata->monotonic_time_sec	= tk->xtime_sec
					+ tk->wall_to_monotonic.tv_sec;
	vdata->monotonic_time_snsec	= tk->xtime_nsec
					+ (tk->wall_to_monotonic.tv_nsec
						<< tk->shift);
	while (vdata->monotonic_time_snsec >=
					(((u64)NSEC_PER_SEC) << tk->shift)) {
		vdata->monotonic_time_snsec -=
					((u64)NSEC_PER_SEC) << tk->shift;
		vdata->monotonic_time_sec++;
	}

	write_seqcount_end(&vdata->seq);
}
#endif


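/*
 * Write the kvmclock wall-clock structure (host boot time, adjusted by the
 * VM's kvmclock offset) into guest memory, bumping the version field so the
 * guest can detect an in-progress update.
 */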
static void kvm_write_wall_clock(struct kvm *kvm, gpa_t wall_clock)
{
	int version;
	int r;
	struct pvclock_wall_clock wc;
	struct timespec boot;

	if (!wall_clock)
		return;

	r = kvm_read_guest(kvm, wall_clock, &version, sizeof(version));
	if (r)
		return;

	if (version & 1)
		++version;  /* first time write, random junk */

	++version;

	kvm_write_guest(kvm, wall_clock, &version, sizeof(version));

	/*
	 * The guest calculates current wall clock time by adding
	 * system time (updated by kvm_guest_time_update below) to the
	 * wall clock specified here.  guest system time equals host
	 * system time for us, thus we must fill in host boot time here.
	 */
	getboottime(&boot);

	if (kvm->arch.kvmclock_offset) {
		struct timespec ts = ns_to_timespec(kvm->arch.kvmclock_offset);
		boot = timespec_sub(boot, ts);
	}
	wc.sec = boot.tv_sec;
	wc.nsec = boot.tv_nsec;