/*
 * Kernel-based Virtual Machine driver for Linux
 *
 * This module enables machines with Intel VT-x extensions to run virtual
 * machines without emulation or binary translation.
 *
 * Copyright (C) 2006 Qumranet, Inc.
 * Copyright 2010 Red Hat, Inc. and/or its affiliates.
 *
 * Authors:
 *   Avi Kivity   <avi@qumranet.com>
 *   Yaniv Kamay  <yaniv@qumranet.com>
 *
 * This work is licensed under the terms of the GNU GPL, version 2.  See
 * the COPYING file in the top-level directory.
 *
 */

#include <kvm/iodev.h>

#include <linux/kvm_host.h>
#include <linux/kvm.h>
#include <linux/module.h>
#include <linux/errno.h>
#include <linux/percpu.h>
#include <linux/mm.h>
#include <linux/miscdevice.h>
#include <linux/vmalloc.h>
#include <linux/reboot.h>
#include <linux/debugfs.h>
#include <linux/highmem.h>
#include <linux/file.h>
#include <linux/syscore_ops.h>
#include <linux/cpu.h>
#include <linux/sched.h>
#include <linux/cpumask.h>
#include <linux/smp.h>
#include <linux/anon_inodes.h>
#include <linux/profile.h>
#include <linux/kvm_para.h>
#include <linux/pagemap.h>
#include <linux/mman.h>
#include <linux/swap.h>
#include <linux/bitops.h>
#include <linux/spinlock.h>
#include <linux/compat.h>
#include <linux/srcu.h>
#include <linux/hugetlb.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include <linux/bsearch.h>

#include <asm/processor.h>
#include <asm/io.h>
#include <asm/ioctl.h>
#include <asm/uaccess.h>
#include <asm/pgtable.h>

#include "coalesced_mmio.h"
#include "async_pf.h"
#include "vfio.h"

#define CREATE_TRACE_POINTS
#include <trace/events/kvm.h>

MODULE_AUTHOR("Qumranet");
MODULE_LICENSE("GPL");

/* Architectures should define their poll value according to the halt latency */
static unsigned int halt_poll_ns = KVM_HALT_POLL_NS_DEFAULT;
module_param(halt_poll_ns, uint, S_IRUGO | S_IWUSR);

/* Default doubles per-vcpu halt_poll_ns. */
static unsigned int halt_poll_ns_grow = 2;
module_param(halt_poll_ns_grow, int, S_IRUGO);

/* Default resets per-vcpu halt_poll_ns. */
static unsigned int halt_poll_ns_shrink;
module_param(halt_poll_ns_shrink, int, S_IRUGO);
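
/*
 * Illustrative note on the two knobs above: with halt_poll_ns_grow = 2 a
 * vcpu's polling window is doubled when growing it looks worthwhile, and
 * with halt_poll_ns_shrink = 0 the window is reset to 0 rather than divided
 * down once polling stops paying off (see the halt-polling logic in
 * kvm_vcpu_block()).
 */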

/*
 * Ordering of locks:
 *
 *	kvm->lock --> kvm->slots_lock --> kvm->irq_lock
 */
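
/*
 * Illustrative nesting only: a path that needs both kvm->lock and
 * kvm->slots_lock must take them in the documented order, e.g.
 *
 *	mutex_lock(&kvm->lock);
 *	mutex_lock(&kvm->slots_lock);
 *	...
 *	mutex_unlock(&kvm->slots_lock);
 *	mutex_unlock(&kvm->lock);
 */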

DEFINE_SPINLOCK(kvm_lock);
static DEFINE_RAW_SPINLOCK(kvm_count_lock);
LIST_HEAD(vm_list);

static cpumask_var_t cpus_hardware_enabled;
static int kvm_usage_count;
static atomic_t hardware_enable_failed;

struct kmem_cache *kvm_vcpu_cache;
EXPORT_SYMBOL_GPL(kvm_vcpu_cache);

static __read_mostly struct preempt_ops kvm_preempt_ops;

struct dentry *kvm_debugfs_dir;
EXPORT_SYMBOL_GPL(kvm_debugfs_dir);

static long kvm_vcpu_ioctl(struct file *file, unsigned int ioctl,
			   unsigned long arg);
#ifdef CONFIG_KVM_COMPAT
static long kvm_vcpu_compat_ioctl(struct file *file, unsigned int ioctl,
				  unsigned long arg);
#endif
static int hardware_enable_all(void);
static void hardware_disable_all(void);

static void kvm_io_bus_destroy(struct kvm_io_bus *bus);

static void kvm_release_pfn_dirty(pfn_t pfn);
static void mark_page_dirty_in_slot(struct kvm_memory_slot *memslot, gfn_t gfn);

__visible bool kvm_rebooting;
EXPORT_SYMBOL_GPL(kvm_rebooting);

static bool largepages_enabled = true;

bool kvm_is_reserved_pfn(pfn_t pfn)
{
	if (pfn_valid(pfn))
		return PageReserved(pfn_to_page(pfn));

	return true;
}

/*
 * Switches to specified vcpu, until a matching vcpu_put()
 */
int vcpu_load(struct kvm_vcpu *vcpu)
{
	int cpu;

	if (mutex_lock_killable(&vcpu->mutex))
		return -EINTR;
	cpu = get_cpu();
	preempt_notifier_register(&vcpu->preempt_notifier);
	kvm_arch_vcpu_load(vcpu, cpu);
	put_cpu();
	return 0;
}
EXPORT_SYMBOL_GPL(vcpu_load);

void vcpu_put(struct kvm_vcpu *vcpu)
{
	preempt_disable();
	kvm_arch_vcpu_put(vcpu);
	preempt_notifier_unregister(&vcpu->preempt_notifier);
	preempt_enable();
	mutex_unlock(&vcpu->mutex);
}
EXPORT_SYMBOL_GPL(vcpu_put);
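
/*
 * Illustrative usage: vcpu ioctl handlers bracket access to vcpu state with
 * a load/put pair, e.g.
 *
 *	r = vcpu_load(vcpu);
 *	if (r)
 *		return r;
 *	... operate on the loaded vcpu ...
 *	vcpu_put(vcpu);
 */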

static void ack_flush(void *_completed)
{
}

bool kvm_make_all_cpus_request(struct kvm *kvm, unsigned int req)
{
	int i, cpu, me;
	cpumask_var_t cpus;
	bool called = true;
	struct kvm_vcpu *vcpu;

	zalloc_cpumask_var(&cpus, GFP_ATOMIC);

	me = get_cpu();
	kvm_for_each_vcpu(i, vcpu, kvm) {
		kvm_make_request(req, vcpu);
		cpu = vcpu->cpu;

		/* Set ->requests bit before we read ->mode */
		smp_mb();

		if (cpus != NULL && cpu != -1 && cpu != me &&
		      kvm_vcpu_exiting_guest_mode(vcpu) != OUTSIDE_GUEST_MODE)
			cpumask_set_cpu(cpu, cpus);
	}
	if (unlikely(cpus == NULL))
		smp_call_function_many(cpu_online_mask, ack_flush, NULL, 1);
	else if (!cpumask_empty(cpus))
		smp_call_function_many(cpus, ack_flush, NULL, 1);
	else
		called = false;
	put_cpu();
	free_cpumask_var(cpus);
	return called;
}

#ifndef CONFIG_HAVE_KVM_ARCH_TLB_FLUSH_ALL
void kvm_flush_remote_tlbs(struct kvm *kvm)
{
	long dirty_count = kvm->tlbs_dirty;

	smp_mb();
	if (kvm_make_all_cpus_request(kvm, KVM_REQ_TLB_FLUSH))
		++kvm->stat.remote_tlb_flush;
	cmpxchg(&kvm->tlbs_dirty, dirty_count, 0);
}
EXPORT_SYMBOL_GPL(kvm_flush_remote_tlbs);
#endif

void kvm_reload_remote_mmus(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MMU_RELOAD);
}

void kvm_make_mclock_inprogress_request(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_MCLOCK_INPROGRESS);
}

void kvm_make_scan_ioapic_request(struct kvm *kvm)
{
	kvm_make_all_cpus_request(kvm, KVM_REQ_SCAN_IOAPIC);
}

int kvm_vcpu_init(struct kvm_vcpu *vcpu, struct kvm *kvm, unsigned id)
{
	struct page *page;
	int r;

	mutex_init(&vcpu->mutex);
	vcpu->cpu = -1;
	vcpu->kvm = kvm;
	vcpu->vcpu_id = id;
	vcpu->pid = NULL;
	vcpu->halt_poll_ns = 0;
	init_waitqueue_head(&vcpu->wq);
	kvm_async_pf_vcpu_init(vcpu);

	vcpu->pre_pcpu = -1;
	INIT_LIST_HEAD(&vcpu->blocked_vcpu_list);

	page = alloc_page(GFP_KERNEL | __GFP_ZERO);
	if (!page) {
		r = -ENOMEM;
		goto fail;
	}
	vcpu->run = page_address(page);

	kvm_vcpu_set_in_spin_loop(vcpu, false);
	kvm_vcpu_set_dy_eligible(vcpu, false);
	vcpu->preempted = false;

	r = kvm_arch_vcpu_init(vcpu);
	if (r < 0)
		goto fail_free_run;

	return 0;

fail_free_run:
	free_page((unsigned long)vcpu->run);
fail:
	return r;
}
EXPORT_SYMBOL_GPL(kvm_vcpu_init);

void kvm_vcpu_uninit(struct kvm_vcpu *vcpu)
{
	put_pid(vcpu->pid);
	kvm_arch_vcpu_uninit(vcpu);
	free_page((unsigned long)vcpu->run);
}
EXPORT_SYMBOL_GPL(kvm_vcpu_uninit);

#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
static inline struct kvm *mmu_notifier_to_kvm(struct mmu_notifier *mn)
{
	return container_of(mn, struct kvm, mmu_notifier);
}

static void kvm_mmu_notifier_invalidate_page(struct mmu_notifier *mn,
					     struct mm_struct *mm,
					     unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush, idx;

	/*
	 * When ->invalidate_page runs, the linux pte has been zapped
	 * already but the page is still allocated until
	 * ->invalidate_page returns. So if we increase the sequence
	 * here the kvm page fault will notice if the spte can't be
	 * established because the page is going to be freed. If
	 * instead the kvm page fault establishes the spte before
	 * ->invalidate_page runs, kvm_unmap_hva will release it
	 * before returning.
	 *
	 * The sequence increase only needs to be seen at spin_unlock
	 * time, and not at spin_lock time.
	 *
	 * Increasing the sequence after the spin_unlock would be
	 * unsafe because the kvm page fault could then establish the
	 * pte after kvm_unmap_hva returned, without noticing the page
	 * is going to be freed.
	 */
	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	kvm->mmu_notifier_seq++;
	need_tlb_flush = kvm_unmap_hva(kvm, address) | kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);

	kvm_arch_mmu_notifier_invalidate_page(kvm, address);

	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long address,
					pte_t pte)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	kvm->mmu_notifier_seq++;
	kvm_set_spte_hva(kvm, address, pte);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_start(struct mmu_notifier *mn,
						    struct mm_struct *mm,
						    unsigned long start,
						    unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int need_tlb_flush = 0, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * The count increase must become visible at unlock time as no
	 * spte can be established without taking the mmu_lock and
	 * count is also read inside the mmu_lock critical section.
	 */
	kvm->mmu_notifier_count++;
	need_tlb_flush = kvm_unmap_hva_range(kvm, start, end);
	need_tlb_flush |= kvm->tlbs_dirty;
	/* we have to flush the tlb before the pages can be freed */
	if (need_tlb_flush)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);
}

static void kvm_mmu_notifier_invalidate_range_end(struct mmu_notifier *mn,
						  struct mm_struct *mm,
						  unsigned long start,
						  unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);

	spin_lock(&kvm->mmu_lock);
	/*
	 * This sequence increase will notify the kvm page fault that
	 * the page that is going to be mapped in the spte could have
	 * been freed.
	 */
	kvm->mmu_notifier_seq++;
	smp_wmb();
	/*
	 * The above sequence increase must be visible before the
	 * below count decrease, which is ensured by the smp_wmb above
	 * in conjunction with the smp_rmb in mmu_notifier_retry().
	 */
	kvm->mmu_notifier_count--;
	spin_unlock(&kvm->mmu_lock);

	BUG_ON(kvm->mmu_notifier_count < 0);
}

static int kvm_mmu_notifier_clear_flush_young(struct mmu_notifier *mn,
					      struct mm_struct *mm,
					      unsigned long start,
					      unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);

	young = kvm_age_hva(kvm, start, end);
	if (young)
		kvm_flush_remote_tlbs(kvm);

	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_clear_young(struct mmu_notifier *mn,
					struct mm_struct *mm,
					unsigned long start,
					unsigned long end)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	/*
	 * Even though we do not flush TLB, this will still adversely
	 * affect performance on pre-Haswell Intel EPT, where there is
	 * no EPT Access Bit to clear so that we have to tear down EPT
	 * tables instead. If we find this unacceptable, we can always
	 * add a parameter to kvm_age_hva so that it effectively doesn't
	 * do anything on clear_young.
	 *
	 * Also note that currently we never issue secondary TLB flushes
	 * from clear_young, leaving this job up to the regular system
	 * cadence. If we find this inaccurate, we might come up with a
	 * more sophisticated heuristic later.
	 */
	young = kvm_age_hva(kvm, start, end);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static int kvm_mmu_notifier_test_young(struct mmu_notifier *mn,
				       struct mm_struct *mm,
				       unsigned long address)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int young, idx;

	idx = srcu_read_lock(&kvm->srcu);
	spin_lock(&kvm->mmu_lock);
	young = kvm_test_age_hva(kvm, address);
	spin_unlock(&kvm->mmu_lock);
	srcu_read_unlock(&kvm->srcu, idx);

	return young;
}

static void kvm_mmu_notifier_release(struct mmu_notifier *mn,
				     struct mm_struct *mm)
{
	struct kvm *kvm = mmu_notifier_to_kvm(mn);
	int idx;

	idx = srcu_read_lock(&kvm->srcu);
	kvm_arch_flush_shadow_all(kvm);
	srcu_read_unlock(&kvm->srcu, idx);
}

static const struct mmu_notifier_ops kvm_mmu_notifier_ops = {
	.invalidate_page	= kvm_mmu_notifier_invalidate_page,
	.invalidate_range_start	= kvm_mmu_notifier_invalidate_range_start,
	.invalidate_range_end	= kvm_mmu_notifier_invalidate_range_end,
	.clear_flush_young	= kvm_mmu_notifier_clear_flush_young,
	.clear_young		= kvm_mmu_notifier_clear_young,
	.test_young		= kvm_mmu_notifier_test_young,
	.change_pte		= kvm_mmu_notifier_change_pte,
	.release		= kvm_mmu_notifier_release,
};

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	kvm->mmu_notifier.ops = &kvm_mmu_notifier_ops;
	return mmu_notifier_register(&kvm->mmu_notifier, current->mm);
}

#else  /* !(CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER) */

static int kvm_init_mmu_notifier(struct kvm *kvm)
{
	return 0;
}

#endif /* CONFIG_MMU_NOTIFIER && KVM_ARCH_WANT_MMU_NOTIFIER */

static struct kvm_memslots *kvm_alloc_memslots(void)
{
	int i;
	struct kvm_memslots *slots;

	slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
	if (!slots)
		return NULL;

	/*
	 * Init kvm generation close to the maximum to easily test the
	 * code that handles generation number wrap-around.
	 */
	slots->generation = -150;
	for (i = 0; i < KVM_MEM_SLOTS_NUM; i++)
		slots->id_to_index[i] = slots->memslots[i].id = i;

	return slots;
}

static void kvm_destroy_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	if (!memslot->dirty_bitmap)
		return;

	kvfree(memslot->dirty_bitmap);
	memslot->dirty_bitmap = NULL;
}

/*
 * Free any memory in @free but not in @dont.
 */
static void kvm_free_memslot(struct kvm *kvm, struct kvm_memory_slot *free,
			      struct kvm_memory_slot *dont)
{
	if (!dont || free->dirty_bitmap != dont->dirty_bitmap)
		kvm_destroy_dirty_bitmap(free);

	kvm_arch_free_memslot(kvm, free, dont);

	free->npages = 0;
}

static void kvm_free_memslots(struct kvm *kvm, struct kvm_memslots *slots)
{
	struct kvm_memory_slot *memslot;

	if (!slots)
		return;

	kvm_for_each_memslot(memslot, slots)
		kvm_free_memslot(kvm, memslot, NULL);

	kvfree(slots);
}

static struct kvm *kvm_create_vm(unsigned long type)
{
	int r, i;
	struct kvm *kvm = kvm_arch_alloc_vm();

	if (!kvm)
		return ERR_PTR(-ENOMEM);

	spin_lock_init(&kvm->mmu_lock);
	atomic_inc(&current->mm->mm_count);
	kvm->mm = current->mm;
	kvm_eventfd_init(kvm);
	mutex_init(&kvm->lock);
	mutex_init(&kvm->irq_lock);
	mutex_init(&kvm->slots_lock);
	atomic_set(&kvm->users_count, 1);
	INIT_LIST_HEAD(&kvm->devices);

	r = kvm_arch_init_vm(kvm, type);
	if (r)
		goto out_err_no_disable;

	r = hardware_enable_all();
	if (r)
		goto out_err_no_disable;

#ifdef CONFIG_HAVE_KVM_IRQFD
	INIT_HLIST_HEAD(&kvm->irq_ack_notifier_list);
#endif

	BUILD_BUG_ON(KVM_MEM_SLOTS_NUM > SHRT_MAX);

	r = -ENOMEM;
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++) {
		kvm->memslots[i] = kvm_alloc_memslots();
		if (!kvm->memslots[i])
			goto out_err_no_srcu;
	}

	if (init_srcu_struct(&kvm->srcu))
		goto out_err_no_srcu;
	if (init_srcu_struct(&kvm->irq_srcu))
		goto out_err_no_irq_srcu;
	for (i = 0; i < KVM_NR_BUSES; i++) {
		kvm->buses[i] = kzalloc(sizeof(struct kvm_io_bus),
					GFP_KERNEL);
		if (!kvm->buses[i])
			goto out_err;
	}

	r = kvm_init_mmu_notifier(kvm);
	if (r)
		goto out_err;

	spin_lock(&kvm_lock);
	list_add(&kvm->vm_list, &vm_list);
	spin_unlock(&kvm_lock);

	preempt_notifier_inc();

	return kvm;

out_err:
	cleanup_srcu_struct(&kvm->irq_srcu);
out_err_no_irq_srcu:
	cleanup_srcu_struct(&kvm->srcu);
out_err_no_srcu:
	hardware_disable_all();
out_err_no_disable:
	for (i = 0; i < KVM_NR_BUSES; i++)
		kfree(kvm->buses[i]);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, kvm->memslots[i]);
	kvm_arch_free_vm(kvm);
	mmdrop(current->mm);
	return ERR_PTR(r);
}

/*
 * Avoid using vmalloc for a small buffer.
 * Should not be used when the size is statically known.
 */
void *kvm_kvzalloc(unsigned long size)
{
	if (size > PAGE_SIZE)
		return vzalloc(size);
	else
		return kzalloc(size, GFP_KERNEL);
}
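
/*
 * Note: the caller cannot tell whether the buffer came from vzalloc() or
 * kzalloc(), so it must be released with kvfree(), which handles both (see
 * e.g. kvm_free_memslots() above).
 */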

static void kvm_destroy_devices(struct kvm *kvm)
{
	struct list_head *node, *tmp;

	list_for_each_safe(node, tmp, &kvm->devices) {
		struct kvm_device *dev =
			list_entry(node, struct kvm_device, vm_node);

		list_del(node);
		dev->ops->destroy(dev);
	}
}

static void kvm_destroy_vm(struct kvm *kvm)
{
	int i;
	struct mm_struct *mm = kvm->mm;

	kvm_arch_sync_events(kvm);
	spin_lock(&kvm_lock);
	list_del(&kvm->vm_list);
	spin_unlock(&kvm_lock);
	kvm_free_irq_routing(kvm);
	for (i = 0; i < KVM_NR_BUSES; i++) {
		if (kvm->buses[i])
			kvm_io_bus_destroy(kvm->buses[i]);
		kvm->buses[i] = NULL;
	}
	kvm_coalesced_mmio_free(kvm);
#if defined(CONFIG_MMU_NOTIFIER) && defined(KVM_ARCH_WANT_MMU_NOTIFIER)
	mmu_notifier_unregister(&kvm->mmu_notifier, kvm->mm);
#else
	kvm_arch_flush_shadow_all(kvm);
#endif
	kvm_arch_destroy_vm(kvm);
	kvm_destroy_devices(kvm);
	for (i = 0; i < KVM_ADDRESS_SPACE_NUM; i++)
		kvm_free_memslots(kvm, kvm->memslots[i]);
	cleanup_srcu_struct(&kvm->irq_srcu);
	cleanup_srcu_struct(&kvm->srcu);
	kvm_arch_free_vm(kvm);
	preempt_notifier_dec();
	hardware_disable_all();
	mmdrop(mm);
}

void kvm_get_kvm(struct kvm *kvm)
{
	atomic_inc(&kvm->users_count);
}
EXPORT_SYMBOL_GPL(kvm_get_kvm);

void kvm_put_kvm(struct kvm *kvm)
{
	if (atomic_dec_and_test(&kvm->users_count))
		kvm_destroy_vm(kvm);
}
EXPORT_SYMBOL_GPL(kvm_put_kvm);


static int kvm_vm_release(struct inode *inode, struct file *filp)
{
	struct kvm *kvm = filp->private_data;

	kvm_irqfd_release(kvm);

	kvm_put_kvm(kvm);
	return 0;
}

/*
 * Allocation size is twice as large as the actual dirty bitmap size.
 * See x86's kvm_vm_ioctl_get_dirty_log() for why this is needed.
 */
static int kvm_create_dirty_bitmap(struct kvm_memory_slot *memslot)
{
	unsigned long dirty_bytes = 2 * kvm_dirty_bitmap_bytes(memslot);

	memslot->dirty_bitmap = kvm_kvzalloc(dirty_bytes);
	if (!memslot->dirty_bitmap)
		return -ENOMEM;

	return 0;
}
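
/*
 * For example, a 1 GiB slot of 4 KiB pages has 262144 pages and therefore a
 * 32 KiB bitmap (one bit per page), so 64 KiB is allocated here; the second
 * half is the temporary buffer used by the x86 dirty-log ioctl referenced
 * above.
 */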

/*
 * Insert memslot and re-sort memslots based on their GFN,
 * so binary search can be used to look up a GFN.
 * The sorting algorithm takes advantage of having an initially
 * sorted array and a known changed memslot position.
 */
static void update_memslots(struct kvm_memslots *slots,
			    struct kvm_memory_slot *new)
{
	int id = new->id;
	int i = slots->id_to_index[id];
	struct kvm_memory_slot *mslots = slots->memslots;

	WARN_ON(mslots[i].id != id);
	if (!new->npages) {
		WARN_ON(!mslots[i].npages);
		if (mslots[i].npages)
			slots->used_slots--;
	} else {
		if (!mslots[i].npages)
			slots->used_slots++;
	}

	while (i < KVM_MEM_SLOTS_NUM - 1 &&
	       new->base_gfn <= mslots[i + 1].base_gfn) {
		if (!mslots[i + 1].npages)
			break;
		mslots[i] = mslots[i + 1];
		slots->id_to_index[mslots[i].id] = i;
		i++;
	}

	/*
	 * The ">=" is needed when creating a slot with base_gfn == 0,
	 * so that it moves before all those with base_gfn == npages == 0.
	 *
	 * On the other hand, if new->npages is zero, the above loop has
	 * already left i pointing to the beginning of the empty part of
	 * mslots, and the ">=" would move the hole backwards in this
	 * case---which is wrong.  So skip the loop when deleting a slot.
	 */
	if (new->npages) {
		while (i > 0 &&
		       new->base_gfn >= mslots[i - 1].base_gfn) {
			mslots[i] = mslots[i - 1];
			slots->id_to_index[mslots[i].id] = i;
			i--;
		}
	} else
		WARN_ON_ONCE(i != slots->used_slots);

	mslots[i] = *new;
	slots->id_to_index[mslots[i].id] = i;
}
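
/*
 * Worked example (illustrative): the array is kept sorted by decreasing
 * base_gfn.  With slots at base_gfn 0x300, 0x200 and 0x100, moving the
 * middle slot to base_gfn 0x400 leaves the first loop idle, the second loop
 * shifts the 0x300 entry down one index, and the updated slot is written at
 * index 0, giving 0x400, 0x300, 0x100.
 */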

static int check_memory_region_flags(const struct kvm_userspace_memory_region *mem)
{
	u32 valid_flags = KVM_MEM_LOG_DIRTY_PAGES;

#ifdef __KVM_HAVE_READONLY_MEM
	valid_flags |= KVM_MEM_READONLY;
#endif

	if (mem->flags & ~valid_flags)
		return -EINVAL;

	return 0;
}

static struct kvm_memslots *install_new_memslots(struct kvm *kvm,
		int as_id, struct kvm_memslots *slots)
{
	struct kvm_memslots *old_memslots = __kvm_memslots(kvm, as_id);

	/*
	 * Set the low bit in the generation, which disables SPTE caching
	 * until the end of synchronize_srcu_expedited.
	 */
	WARN_ON(old_memslots->generation & 1);
	slots->generation = old_memslots->generation + 1;

	rcu_assign_pointer(kvm->memslots[as_id], slots);
	synchronize_srcu_expedited(&kvm->srcu);

	/*
	 * Increment the new memslot generation a second time. This prevents
	 * vm exits that race with memslot updates from caching a memslot
	 * generation that will (potentially) be valid forever.
	 */
	slots->generation++;

	kvm_arch_memslots_updated(kvm, slots);

	return old_memslots;
}
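
/*
 * Put differently: the generation is even whenever the memslot array is
 * stable and odd while an update is in flight, e.g.
 *
 *	old_memslots->generation == 2N		stable
 *	slots->generation = 2N + 1		update in progress, SPTE caching off
 *	slots->generation = 2N + 2		stable again, strictly newer than 2N
 */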

/*
 * Allocate some memory and give it an address in the guest physical address
 * space.
 *
 * Discontiguous memory is allowed, mostly for framebuffers.
 *
 * Must be called holding kvm->slots_lock for write.
 */
int __kvm_set_memory_region(struct kvm *kvm,
			    const struct kvm_userspace_memory_region *mem)
{
	int r;
	gfn_t base_gfn;
	unsigned long npages;
	struct kvm_memory_slot *slot;
	struct kvm_memory_slot old, new;
	struct kvm_memslots *slots = NULL, *old_memslots;
	int as_id, id;
	enum kvm_mr_change change;

	r = check_memory_region_flags(mem);
	if (r)
		goto out;

	r = -EINVAL;
	as_id = mem->slot >> 16;
	id = (u16)mem->slot;

	/* General sanity checks */
	if (mem->memory_size & (PAGE_SIZE - 1))
		goto out;
	if (mem->guest_phys_addr & (PAGE_SIZE - 1))
		goto out;
	/* We can read the guest memory with __xxx_user() later on. */
	if ((id < KVM_USER_MEM_SLOTS) &&
	    ((mem->userspace_addr & (PAGE_SIZE - 1)) ||
	     !access_ok(VERIFY_WRITE,
			(void __user *)(unsigned long)mem->userspace_addr,
			mem->memory_size)))
		goto out;
	if (as_id >= KVM_ADDRESS_SPACE_NUM || id >= KVM_MEM_SLOTS_NUM)
		goto out;
	if (mem->guest_phys_addr + mem->memory_size < mem->guest_phys_addr)
		goto out;

	slot = id_to_memslot(__kvm_memslots(kvm, as_id), id);
	base_gfn = mem->guest_phys_addr >> PAGE_SHIFT;
	npages = mem->memory_size >> PAGE_SHIFT;

	if (npages > KVM_MEM_MAX_NR_PAGES)
		goto out;

	new = old = *slot;

	new.id = id;
	new.base_gfn = base_gfn;
	new.npages = npages;
	new.flags = mem->flags;


	if (npages) {
		if (!old.npages)
			change = KVM_MR_CREATE;
		else { /* Modify an existing slot. */
			if ((mem->userspace_addr != old.userspace_addr) ||
			    (npages != old.npages) ||
			    ((new.flags ^ old.flags) & KVM_MEM_READONLY))
				goto out;

			if (base_gfn != old.base_gfn)
				change = KVM_MR_MOVE;
			else if (new.flags != old.flags)
				change = KVM_MR_FLAGS_ONLY;
			else { /* Nothing to change. */
				r = 0;
				goto out;
			}
		}
	} else {
		if (!old.npages)
			goto out;

		change = KVM_MR_DELETE;
		new.base_gfn = 0;
		new.flags = 0;
	}

	if ((change == KVM_MR_CREATE) || (change == KVM_MR_MOVE)) {
		/* Check for overlaps */
		r = -EEXIST;
		kvm_for_each_memslot(slot, __kvm_memslots(kvm, as_id)) {
			if (slot->id == id)
				continue;
			if (!((base_gfn + npages <= slot->base_gfn) ||
			      (base_gfn >= slot->base_gfn + slot->npages)))
				goto out;
		}
	}

	/* Free page dirty bitmap if unneeded */
	if (!(new.flags & KVM_MEM_LOG_DIRTY_PAGES))
		new.dirty_bitmap = NULL;

	r = -ENOMEM;
	if (change == KVM_MR_CREATE) {
		new.userspace_addr = mem->userspace_addr;

		if (kvm_arch_create_memslot(kvm, &new, npages))
			goto out_free;
	}

	/* Allocate page dirty bitmap if needed */
	if ((new.flags & KVM_MEM_LOG_DIRTY_PAGES) && !new.dirty_bitmap) {
		if (kvm_create_dirty_bitmap(&new) < 0)
			goto out_free;
	}

	slots = kvm_kvzalloc(sizeof(struct kvm_memslots));
	if (!slots)
		goto out_free;
	memcpy(slots, __kvm_memslots(kvm, as_id), sizeof(struct kvm_memslots));

	if ((change == KVM_MR_DELETE) || (change == KVM_MR_MOVE)) {
		slot = id_to_memslot(slots, id);
		slot->flags |= KVM_MEMSLOT_INVALID;

		old_memslots = install_new_memslots(kvm, as_id, slots);

		/* slot was deleted or moved, clear iommu mapping */
		kvm_iommu_unmap_pages(kvm, &old);
		/* From this point no new shadow pages pointing to a deleted,
		 * or moved, memslot will be created.
		 *
		 * validation of sp->gfn happens in:
		 *	- gfn_to_hva (kvm_read_guest, gfn_to_pfn)
		 *	- kvm_is_visible_gfn (mmu_check_roots)
		 */
		kvm_arch_flush_shadow_memslot(kvm, slot);

		/*
		 * We can re-use the old_memslots from above, the only difference
		 * from the currently installed memslots is the invalid flag.  This
		 * will get overwritten by update_memslots anyway.
		 */
		slots = old_memslots;
	}

	r = kvm_arch_prepare_memory_region(kvm, &new, mem, change);
	if (r)
		goto out_slots;

	/* actual memory is freed via old in kvm_free_memslot below */
	if (change == KVM_MR_DELETE) {
		new.dirty_bitmap = NULL;
		memset(&new.arch, 0, sizeof(new.arch));
	}

	update_memslots(slots, &new);
	old_memslots = install_new_memslots(kvm, as_id, slots);

	kvm_arch_commit_memory_region(kvm, mem, &old, &new, change);

	kvm_free_memslot(kvm, &old, &new);
	kvfree(old_memslots);

	/*
	 * IOMMU mapping:  New slots need to be mapped.  Old slots need to be
	 * un-mapped and re-mapped if their base changes.  Since base change
	 * unmapping is handled above with slot deletion, mapping alone is
	 * needed here.  Anything else the iommu might care about for existing
	 * slots (size changes, userspace addr changes and read-only flag
	 * changes) is disallowed above, so any other attribute changes getting
	 * here can be skipped.
	 */
	if (as_id == 0 && (change == KVM_MR_CREATE || change == KVM_MR_MOVE)) {
		r = kvm_iommu_map_pages(kvm, &new);
		return r;
	}

	return 0;

out_slots:
	kvfree(slots);
out_free:
	kvm_free_memslot(kvm, &new, &old);
out:
	return r;
}
EXPORT_SYMBOL_GPL(__kvm_set_memory_region);

int kvm_set_memory_region(struct kvm *kvm,
			  const struct kvm_userspace_memory_region *mem)