heartbeat.c 47.3 KB
Newer Older
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
/* -*- mode: c; c-basic-offset: 8; -*-
 * vim: noexpandtab sw=8 ts=8 sts=0:
 *
 * Copyright (C) 2004, 2005 Oracle.  All rights reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License as published by the Free Software Foundation; either
 * version 2 of the License, or (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public
 * License along with this program; if not, write to the
 * Free Software Foundation, Inc., 59 Temple Place - Suite 330,
 * Boston, MA 021110-1307, USA.
 */

#include <linux/kernel.h>
#include <linux/sched.h>
#include <linux/jiffies.h>
#include <linux/module.h>
#include <linux/fs.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/delay.h>
#include <linux/file.h>
#include <linux/kthread.h>
#include <linux/configfs.h>
#include <linux/random.h>
#include <linux/crc32.h>
#include <linux/time.h>

#include "heartbeat.h"
#include "tcp.h"
#include "nodemanager.h"
#include "quorum.h"

#include "masklog.h"


/*
 * The first heartbeat pass had one global thread that would serialize all hb
 * callback calls.  This global serializing sem should only be removed once
 * we've made sure that all callees can deal with being called concurrently
 * from multiple hb region threads.
 */
static DECLARE_RWSEM(o2hb_callback_sem);

/*
 * multiple hb threads are watching multiple regions.  A node is live
 * whenever any of the threads sees activity from the node in its region.
 */
57
static DEFINE_SPINLOCK(o2hb_live_lock);
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
static struct list_head o2hb_live_slots[O2NM_MAX_NODES];
static unsigned long o2hb_live_node_bitmap[BITS_TO_LONGS(O2NM_MAX_NODES)];
static LIST_HEAD(o2hb_node_events);
static DECLARE_WAIT_QUEUE_HEAD(o2hb_steady_queue);

static LIST_HEAD(o2hb_all_regions);

static struct o2hb_callback {
	struct list_head list;
} o2hb_callbacks[O2HB_NUM_CB];

static struct o2hb_callback *hbcall_from_type(enum o2hb_callback_type type);

#define O2HB_DEFAULT_BLOCK_BITS       9

unsigned int o2hb_dead_threshold = O2HB_DEFAULT_DEAD_THRESHOLD;

/* Only sets a new threshold if there are no active regions. 
 *
 * No locking or otherwise interesting code is required for reading
 * o2hb_dead_threshold as it can't change once regions are active and
 * it's not interesting to anyone until then anyway. */
static void o2hb_dead_threshold_set(unsigned int threshold)
{
	if (threshold > O2HB_MIN_DEAD_THRESHOLD) {
		spin_lock(&o2hb_live_lock);
		if (list_empty(&o2hb_all_regions))
			o2hb_dead_threshold = threshold;
		spin_unlock(&o2hb_live_lock);
	}
}

struct o2hb_node_event {
	struct list_head        hn_item;
	enum o2hb_callback_type hn_event_type;
	struct o2nm_node        *hn_node;
	int                     hn_node_num;
};

struct o2hb_disk_slot {
	struct o2hb_disk_heartbeat_block *ds_raw_block;
	u8			ds_node_num;
	u64			ds_last_time;
	u64			ds_last_generation;
	u16			ds_equal_samples;
	u16			ds_changed_samples;
	struct list_head	ds_live_item;
};

/* each thread owns a region.. when we're asked to tear down the region
 * we ask the thread to stop, who cleans up the region */
struct o2hb_region {
	struct config_item	hr_item;

	struct list_head	hr_all_item;
	unsigned		hr_unclean_stop:1;

	/* protected by the hr_callback_sem */
	struct task_struct 	*hr_task;

	unsigned int		hr_blocks;
	unsigned long long	hr_start_block;

	unsigned int		hr_block_bits;
	unsigned int		hr_block_bytes;

	unsigned int		hr_slots_per_page;
	unsigned int		hr_num_pages;

	struct page             **hr_slot_data;
	struct block_device	*hr_bdev;
	struct o2hb_disk_slot	*hr_slots;

	/* let the person setting up hb wait for it to return until it
	 * has reached a 'steady' state.  This will be fixed when we have
	 * a more complete api that doesn't lead to this sort of fragility. */
	atomic_t		hr_steady_iterations;

	char			hr_dev_name[BDEVNAME_SIZE];

	unsigned int		hr_timeout_ms;

	/* randomized as the region goes up and down so that a node
	 * recognizes a node going up and down in one iteration */
	u64			hr_generation;

David Howells's avatar
David Howells committed
144
	struct delayed_work	hr_write_timeout_work;
145
146
147
148
149
150
151
152
153
154
155
	unsigned long		hr_last_timeout_start;

	/* Used during o2hb_check_slot to hold a copy of the block
	 * being checked because we temporarily have to zero out the
	 * crc field. */
	struct o2hb_disk_heartbeat_block *hr_tmp_block;
};

struct o2hb_bio_wait_ctxt {
	atomic_t          wc_num_reqs;
	struct completion wc_io_complete;
156
	int               wc_error;
157
158
};

David Howells's avatar
David Howells committed
159
static void o2hb_write_timeout(struct work_struct *work)
160
{
David Howells's avatar
David Howells committed
161
162
163
	struct o2hb_region *reg =
		container_of(work, struct o2hb_region,
			     hr_write_timeout_work.work);
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186

	mlog(ML_ERROR, "Heartbeat write timeout to device %s after %u "
	     "milliseconds\n", reg->hr_dev_name,
	     jiffies_to_msecs(jiffies - reg->hr_last_timeout_start)); 
	o2quo_disk_timeout();
}

static void o2hb_arm_write_timeout(struct o2hb_region *reg)
{
	mlog(0, "Queue write timeout for %u ms\n", O2HB_MAX_WRITE_TIMEOUT_MS);

	cancel_delayed_work(&reg->hr_write_timeout_work);
	reg->hr_last_timeout_start = jiffies;
	schedule_delayed_work(&reg->hr_write_timeout_work,
			      msecs_to_jiffies(O2HB_MAX_WRITE_TIMEOUT_MS));
}

static void o2hb_disarm_write_timeout(struct o2hb_region *reg)
{
	cancel_delayed_work(&reg->hr_write_timeout_work);
	flush_scheduled_work();
}

187
static inline void o2hb_bio_wait_init(struct o2hb_bio_wait_ctxt *wc)
188
{
189
	atomic_set(&wc->wc_num_reqs, 1);
190
	init_completion(&wc->wc_io_complete);
191
	wc->wc_error = 0;
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
}

/* Used in error paths too */
static inline void o2hb_bio_wait_dec(struct o2hb_bio_wait_ctxt *wc,
				     unsigned int num)
{
	/* sadly atomic_sub_and_test() isn't available on all platforms.  The
	 * good news is that the fast path only completes one at a time */
	while(num--) {
		if (atomic_dec_and_test(&wc->wc_num_reqs)) {
			BUG_ON(num > 0);
			complete(&wc->wc_io_complete);
		}
	}
}

static void o2hb_wait_on_io(struct o2hb_region *reg,
			    struct o2hb_bio_wait_ctxt *wc)
{
	struct address_space *mapping = reg->hr_bdev->bd_inode->i_mapping;

	blk_run_address_space(mapping);
214
	o2hb_bio_wait_dec(wc, 1);
215
216
217
218

	wait_for_completion(&wc->wc_io_complete);
}

Al Viro's avatar
Al Viro committed
219
static void o2hb_bio_end_io(struct bio *bio,
220
221
222
223
			   int error)
{
	struct o2hb_bio_wait_ctxt *wc = bio->bi_private;

224
	if (error) {
225
		mlog(ML_ERROR, "IO Error %d\n", error);
226
227
		wc->wc_error = error;
	}
228
229

	o2hb_bio_wait_dec(wc, 1);
230
	bio_put(bio);
231
232
233
234
235
236
}

/* Setup a Bio to cover I/O against num_slots slots starting at
 * start_slot. */
static struct bio *o2hb_setup_one_bio(struct o2hb_region *reg,
				      struct o2hb_bio_wait_ctxt *wc,
237
238
				      unsigned int *current_slot,
				      unsigned int max_slots)
239
{
240
	int len, current_page;
241
242
243
	unsigned int vec_len, vec_start;
	unsigned int bits = reg->hr_block_bits;
	unsigned int spp = reg->hr_slots_per_page;
244
	unsigned int cs = *current_slot;
245
246
247
248
249
250
251
	struct bio *bio;
	struct page *page;

	/* Testing has shown this allocation to take long enough under
	 * GFP_KERNEL that the local node can get fenced. It would be
	 * nicest if we could pre-allocate these bios and avoid this
	 * all together. */
252
	bio = bio_alloc(GFP_ATOMIC, 16);
253
254
255
256
257
258
259
	if (!bio) {
		mlog(ML_ERROR, "Could not alloc slots BIO!\n");
		bio = ERR_PTR(-ENOMEM);
		goto bail;
	}

	/* Must put everything in 512 byte sectors for the bio... */
260
	bio->bi_sector = (reg->hr_start_block + cs) << (bits - 9);
261
262
263
264
	bio->bi_bdev = reg->hr_bdev;
	bio->bi_private = wc;
	bio->bi_end_io = o2hb_bio_end_io;

265
266
267
268
	vec_start = (cs << bits) % PAGE_CACHE_SIZE;
	while(cs < max_slots) {
		current_page = cs / spp;
		page = reg->hr_slot_data[current_page];
269

270
		vec_len = min(PAGE_CACHE_SIZE - vec_start,
271
			      (max_slots-cs) * (PAGE_CACHE_SIZE/spp) );
272
273

		mlog(ML_HB_BIO, "page %d, vec_len = %u, vec_start = %u\n",
274
		     current_page, vec_len, vec_start);
275
276

		len = bio_add_page(bio, page, vec_len, vec_start);
277
		if (len != vec_len) break;
278

279
		cs += vec_len / (PAGE_CACHE_SIZE/spp);
280
281
282
283
		vec_start = 0;
	}

bail:
284
	*current_slot = cs;
285
286
287
288
289
290
	return bio;
}

static int o2hb_read_slots(struct o2hb_region *reg,
			   unsigned int max_slots)
{
291
292
	unsigned int current_slot=0;
	int status;
293
294
295
	struct o2hb_bio_wait_ctxt wc;
	struct bio *bio;

296
	o2hb_bio_wait_init(&wc);
297

298
299
	while(current_slot < max_slots) {
		bio = o2hb_setup_one_bio(reg, &wc, &current_slot, max_slots);
300
301
302
303
304
305
		if (IS_ERR(bio)) {
			status = PTR_ERR(bio);
			mlog_errno(status);
			goto bail_and_wait;
		}

306
		atomic_inc(&wc.wc_num_reqs);
307
308
309
310
311
312
313
		submit_bio(READ, bio);
	}

	status = 0;

bail_and_wait:
	o2hb_wait_on_io(reg, &wc);
314
315
	if (wc.wc_error && !status)
		status = wc.wc_error;
316
317
318
319
320
321
322
323
324
325
326

	return status;
}

static int o2hb_issue_node_write(struct o2hb_region *reg,
				 struct o2hb_bio_wait_ctxt *write_wc)
{
	int status;
	unsigned int slot;
	struct bio *bio;

327
	o2hb_bio_wait_init(write_wc);
328
329
330

	slot = o2nm_this_node();

331
	bio = o2hb_setup_one_bio(reg, write_wc, &slot, slot+1);
332
333
334
335
336
337
	if (IS_ERR(bio)) {
		status = PTR_ERR(bio);
		mlog_errno(status);
		goto bail;
	}

338
	atomic_inc(&write_wc->wc_num_reqs);
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
	submit_bio(WRITE, bio);

	status = 0;
bail:
	return status;
}

static u32 o2hb_compute_block_crc_le(struct o2hb_region *reg,
				     struct o2hb_disk_heartbeat_block *hb_block)
{
	__le32 old_cksum;
	u32 ret;

	/* We want to compute the block crc with a 0 value in the
	 * hb_cksum field. Save it off here and replace after the
	 * crc. */
	old_cksum = hb_block->hb_cksum;
	hb_block->hb_cksum = 0;

	ret = crc32_le(0, (unsigned char *) hb_block, reg->hr_block_bytes);

	hb_block->hb_cksum = old_cksum;

	return ret;
}

static void o2hb_dump_slot(struct o2hb_disk_heartbeat_block *hb_block)
{
367
368
369
370
371
	mlog(ML_ERROR, "Dump slot information: seq = 0x%llx, node = %u, "
	     "cksum = 0x%x, generation 0x%llx\n",
	     (long long)le64_to_cpu(hb_block->hb_seq),
	     hb_block->hb_node, le32_to_cpu(hb_block->hb_cksum),
	     (long long)le64_to_cpu(hb_block->hb_generation));
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
}

static int o2hb_verify_crc(struct o2hb_region *reg,
			   struct o2hb_disk_heartbeat_block *hb_block)
{
	u32 read, computed;

	read = le32_to_cpu(hb_block->hb_cksum);
	computed = o2hb_compute_block_crc_le(reg, hb_block);

	return read == computed;
}

/* We want to make sure that nobody is heartbeating on top of us --
 * this will help detect an invalid configuration. */
static int o2hb_check_last_timestamp(struct o2hb_region *reg)
{
	int node_num, ret;
	struct o2hb_disk_slot *slot;
	struct o2hb_disk_heartbeat_block *hb_block;

	node_num = o2nm_this_node();

	ret = 1;
	slot = &reg->hr_slots[node_num];
	/* Don't check on our 1st timestamp */
	if (slot->ds_last_time) {
		hb_block = slot->ds_raw_block;

		if (le64_to_cpu(hb_block->hb_seq) != slot->ds_last_time)
			ret = 0;
	}

	return ret;
}

static inline void o2hb_prepare_block(struct o2hb_region *reg,
				      u64 generation)
{
	int node_num;
	u64 cputime;
	struct o2hb_disk_slot *slot;
	struct o2hb_disk_heartbeat_block *hb_block;

	node_num = o2nm_this_node();
	slot = &reg->hr_slots[node_num];

	hb_block = (struct o2hb_disk_heartbeat_block *)slot->ds_raw_block;
	memset(hb_block, 0, reg->hr_block_bytes);
	/* TODO: time stuff */
	cputime = CURRENT_TIME.tv_sec;
	if (!cputime)
		cputime = 1;

	hb_block->hb_seq = cpu_to_le64(cputime);
	hb_block->hb_node = node_num;
	hb_block->hb_generation = cpu_to_le64(generation);
429
	hb_block->hb_dead_ms = cpu_to_le32(o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS);
430
431
432
433
434

	/* This step must always happen last! */
	hb_block->hb_cksum = cpu_to_le32(o2hb_compute_block_crc_le(reg,
								   hb_block));

435
	mlog(ML_HB_BIO, "our node generation = 0x%llx, cksum = 0x%x\n",
436
	     (long long)generation,
437
	     le32_to_cpu(hb_block->hb_cksum));
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
}

static void o2hb_fire_callbacks(struct o2hb_callback *hbcall,
				struct o2nm_node *node,
				int idx)
{
	struct list_head *iter;
	struct o2hb_callback_func *f;

	list_for_each(iter, &hbcall->list) {
		f = list_entry(iter, struct o2hb_callback_func, hc_item);
		mlog(ML_HEARTBEAT, "calling funcs %p\n", f);
		(f->hc_func)(node, idx, f->hc_data);
	}
}

/* Will run the list in order until we process the passed event */
static void o2hb_run_event_list(struct o2hb_node_event *queued_event)
{
	int empty;
	struct o2hb_callback *hbcall;
	struct o2hb_node_event *event;

	spin_lock(&o2hb_live_lock);
	empty = list_empty(&queued_event->hn_item);
	spin_unlock(&o2hb_live_lock);
	if (empty)
		return;

	/* Holding callback sem assures we don't alter the callback
	 * lists when doing this, and serializes ourselves with other
	 * processes wanting callbacks. */
	down_write(&o2hb_callback_sem);

	spin_lock(&o2hb_live_lock);
	while (!list_empty(&o2hb_node_events)
	       && !list_empty(&queued_event->hn_item)) {
		event = list_entry(o2hb_node_events.next,
				   struct o2hb_node_event,
				   hn_item);
		list_del_init(&event->hn_item);
		spin_unlock(&o2hb_live_lock);

		mlog(ML_HEARTBEAT, "Node %s event for %d\n",
		     event->hn_event_type == O2HB_NODE_UP_CB ? "UP" : "DOWN",
		     event->hn_node_num);

		hbcall = hbcall_from_type(event->hn_event_type);

		/* We should *never* have gotten on to the list with a
		 * bad type... This isn't something that we should try
		 * to recover from. */
		BUG_ON(IS_ERR(hbcall));

		o2hb_fire_callbacks(hbcall, event->hn_node, event->hn_node_num);

		spin_lock(&o2hb_live_lock);
	}
	spin_unlock(&o2hb_live_lock);

	up_write(&o2hb_callback_sem);
}

static void o2hb_queue_node_event(struct o2hb_node_event *event,
				  enum o2hb_callback_type type,
				  struct o2nm_node *node,
				  int node_num)
{
	assert_spin_locked(&o2hb_live_lock);

	event->hn_event_type = type;
	event->hn_node = node;
	event->hn_node_num = node_num;

	mlog(ML_HEARTBEAT, "Queue node %s event for node %d\n",
	     type == O2HB_NODE_UP_CB ? "UP" : "DOWN", node_num);

	list_add_tail(&event->hn_item, &o2hb_node_events);
}

static void o2hb_shutdown_slot(struct o2hb_disk_slot *slot)
{
	struct o2hb_node_event event =
		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
	struct o2nm_node *node;

	node = o2nm_get_node_by_num(slot->ds_node_num);
	if (!node)
		return;

	spin_lock(&o2hb_live_lock);
	if (!list_empty(&slot->ds_live_item)) {
		mlog(ML_HEARTBEAT, "Shutdown, node %d leaves region\n",
		     slot->ds_node_num);

		list_del_init(&slot->ds_live_item);

		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
					      slot->ds_node_num);
		}
	}
	spin_unlock(&o2hb_live_lock);

	o2hb_run_event_list(&event);

	o2nm_node_put(node);
}

static int o2hb_check_slot(struct o2hb_region *reg,
			   struct o2hb_disk_slot *slot)
{
	int changed = 0, gen_changed = 0;
	struct o2hb_node_event event =
		{ .hn_item = LIST_HEAD_INIT(event.hn_item), };
	struct o2nm_node *node;
	struct o2hb_disk_heartbeat_block *hb_block = reg->hr_tmp_block;
	u64 cputime;
558
559
	unsigned int dead_ms = o2hb_dead_threshold * O2HB_REGION_TIMEOUT_MS;
	unsigned int slot_dead_ms;
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607

	memcpy(hb_block, slot->ds_raw_block, reg->hr_block_bytes);

	/* Is this correct? Do we assume that the node doesn't exist
	 * if we're not configured for him? */
	node = o2nm_get_node_by_num(slot->ds_node_num);
	if (!node)
		return 0;

	if (!o2hb_verify_crc(reg, hb_block)) {
		/* all paths from here will drop o2hb_live_lock for
		 * us. */
		spin_lock(&o2hb_live_lock);

		/* Don't print an error on the console in this case -
		 * a freshly formatted heartbeat area will not have a
		 * crc set on it. */
		if (list_empty(&slot->ds_live_item))
			goto out;

		/* The node is live but pushed out a bad crc. We
		 * consider it a transient miss but don't populate any
		 * other values as they may be junk. */
		mlog(ML_ERROR, "Node %d has written a bad crc to %s\n",
		     slot->ds_node_num, reg->hr_dev_name);
		o2hb_dump_slot(hb_block);

		slot->ds_equal_samples++;
		goto fire_callbacks;
	}

	/* we don't care if these wrap.. the state transitions below
	 * clear at the right places */
	cputime = le64_to_cpu(hb_block->hb_seq);
	if (slot->ds_last_time != cputime)
		slot->ds_changed_samples++;
	else
		slot->ds_equal_samples++;
	slot->ds_last_time = cputime;

	/* The node changed heartbeat generations. We assume this to
	 * mean it dropped off but came back before we timed out. We
	 * want to consider it down for the time being but don't want
	 * to lose any changed_samples state we might build up to
	 * considering it live again. */
	if (slot->ds_last_generation != le64_to_cpu(hb_block->hb_generation)) {
		gen_changed = 1;
		slot->ds_equal_samples = 0;
608
609
610
611
		mlog(ML_HEARTBEAT, "Node %d changed generation (0x%llx "
		     "to 0x%llx)\n", slot->ds_node_num,
		     (long long)slot->ds_last_generation,
		     (long long)le64_to_cpu(hb_block->hb_generation));
612
613
614
615
	}

	slot->ds_last_generation = le64_to_cpu(hb_block->hb_generation);

616
617
618
619
620
621
	mlog(ML_HEARTBEAT, "Slot %d gen 0x%llx cksum 0x%x "
	     "seq %llu last %llu changed %u equal %u\n",
	     slot->ds_node_num, (long long)slot->ds_last_generation,
	     le32_to_cpu(hb_block->hb_cksum),
	     (unsigned long long)le64_to_cpu(hb_block->hb_seq), 
	     (unsigned long long)slot->ds_last_time, slot->ds_changed_samples,
622
623
624
625
626
627
628
629
630
	     slot->ds_equal_samples);

	spin_lock(&o2hb_live_lock);

fire_callbacks:
	/* dead nodes only come to life after some number of
	 * changes at any time during their dead time */
	if (list_empty(&slot->ds_live_item) &&
	    slot->ds_changed_samples >= O2HB_LIVE_THRESHOLD) {
631
632
		mlog(ML_HEARTBEAT, "Node %d (id 0x%llx) joined my region\n",
		     slot->ds_node_num, (long long)slot->ds_last_generation);
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647

		/* first on the list generates a callback */
		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			set_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			o2hb_queue_node_event(&event, O2HB_NODE_UP_CB, node,
					      slot->ds_node_num);

			changed = 1;
		}

		list_add_tail(&slot->ds_live_item,
			      &o2hb_live_slots[slot->ds_node_num]);

		slot->ds_equal_samples = 0;
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664

		/* We want to be sure that all nodes agree on the
		 * number of milliseconds before a node will be
		 * considered dead. The self-fencing timeout is
		 * computed from this value, and a discrepancy might
		 * result in heartbeat calling a node dead when it
		 * hasn't self-fenced yet. */
		slot_dead_ms = le32_to_cpu(hb_block->hb_dead_ms);
		if (slot_dead_ms && slot_dead_ms != dead_ms) {
			/* TODO: Perhaps we can fail the region here. */
			mlog(ML_ERROR, "Node %d on device %s has a dead count "
			     "of %u ms, but our count is %u ms.\n"
			     "Please double check your configuration values "
			     "for 'O2CB_HEARTBEAT_THRESHOLD'\n",
			     slot->ds_node_num, reg->hr_dev_name, slot_dead_ms,
			     dead_ms);
		}
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
		goto out;
	}

	/* if the list is dead, we're done.. */
	if (list_empty(&slot->ds_live_item))
		goto out;

	/* live nodes only go dead after enough consequtive missed
	 * samples..  reset the missed counter whenever we see
	 * activity */
	if (slot->ds_equal_samples >= o2hb_dead_threshold || gen_changed) {
		mlog(ML_HEARTBEAT, "Node %d left my region\n",
		     slot->ds_node_num);

		/* last off the live_slot generates a callback */
		list_del_init(&slot->ds_live_item);
		if (list_empty(&o2hb_live_slots[slot->ds_node_num])) {
			clear_bit(slot->ds_node_num, o2hb_live_node_bitmap);

			o2hb_queue_node_event(&event, O2HB_NODE_DOWN_CB, node,
					      slot->ds_node_num);

			changed = 1;
		}

		/* We don't clear this because the node is still
		 * actually writing new blocks. */
		if (!gen_changed)
			slot->ds_changed_samples = 0;
		goto out;
	}
	if (slot->ds_changed_samples) {
		slot->ds_changed_samples = 0;
		slot->ds_equal_samples = 0;
	}
out:
	spin_unlock(&o2hb_live_lock);

	o2hb_run_event_list(&event);

	o2nm_node_put(node);
	return changed;
}

/* This could be faster if we just implmented a find_last_bit, but I
 * don't think the circumstances warrant it. */
static int o2hb_highest_node(unsigned long *nodes,
			     int numbits)
{
	int highest, node;

	highest = numbits;
	node = -1;
	while ((node = find_next_bit(nodes, numbits, node + 1)) != -1) {
		if (node >= numbits)
			break;

		highest = node;
	}

	return highest;
}

728
static int o2hb_do_disk_heartbeat(struct o2hb_region *reg)
729
730
731
732
733
{
	int i, ret, highest_node, change = 0;
	unsigned long configured_nodes[BITS_TO_LONGS(O2NM_MAX_NODES)];
	struct o2hb_bio_wait_ctxt write_wc;

734
735
736
737
738
739
	ret = o2nm_configured_node_map(configured_nodes,
				       sizeof(configured_nodes));
	if (ret) {
		mlog_errno(ret);
		return ret;
	}
740
741
742
743

	highest_node = o2hb_highest_node(configured_nodes, O2NM_MAX_NODES);
	if (highest_node >= O2NM_MAX_NODES) {
		mlog(ML_NOTICE, "ocfs2_heartbeat: no configured nodes found!\n");
744
		return -EINVAL;
745
746
747
748
749
750
751
752
753
	}

	/* No sense in reading the slots of nodes that don't exist
	 * yet. Of course, if the node definitions have holes in them
	 * then we're reading an empty slot anyway... Consider this
	 * best-effort. */
	ret = o2hb_read_slots(reg, highest_node + 1);
	if (ret < 0) {
		mlog_errno(ret);
754
		return ret;
755
756
757
758
759
760
761
762
763
764
765
766
767
768
	}

	/* With an up to date view of the slots, we can check that no
	 * other node has been improperly configured to heartbeat in
	 * our slot. */
	if (!o2hb_check_last_timestamp(reg))
		mlog(ML_ERROR, "Device \"%s\": another node is heartbeating "
		     "in our slot!\n", reg->hr_dev_name);

	/* fill in the proper info for our next heartbeat */
	o2hb_prepare_block(reg, reg->hr_generation);

	/* And fire off the write. Note that we don't wait on this I/O
	 * until later. */
769
	ret = o2hb_issue_node_write(reg, &write_wc);
770
771
	if (ret < 0) {
		mlog_errno(ret);
772
		return ret;
773
774
775
776
777
778
779
780
781
782
783
784
785
786
	}

	i = -1;
	while((i = find_next_bit(configured_nodes, O2NM_MAX_NODES, i + 1)) < O2NM_MAX_NODES) {

		change |= o2hb_check_slot(reg, &reg->hr_slots[i]);
	}

	/*
	 * We have to be sure we've advertised ourselves on disk
	 * before we can go to steady state.  This ensures that
	 * people we find in our steady state have seen us.
	 */
	o2hb_wait_on_io(reg, &write_wc);
787
788
789
790
791
792
793
794
795
	if (write_wc.wc_error) {
		/* Do not re-arm the write timeout on I/O error - we
		 * can't be sure that the new block ever made it to
		 * disk */
		mlog(ML_ERROR, "Write error %d on device \"%s\"\n",
		     write_wc.wc_error, reg->hr_dev_name);
		return write_wc.wc_error;
	}

796
797
798
799
800
801
802
	o2hb_arm_write_timeout(reg);

	/* let the person who launched us know when things are steady */
	if (!change && (atomic_read(&reg->hr_steady_iterations) != 0)) {
		if (atomic_dec_and_test(&reg->hr_steady_iterations))
			wake_up(&o2hb_steady_queue);
	}
803
804

	return 0;
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
}

/* Subtract b from a, storing the result in a. a *must* have a larger
 * value than b. */
static void o2hb_tv_subtract(struct timeval *a,
			     struct timeval *b)
{
	/* just return 0 when a is after b */
	if (a->tv_sec < b->tv_sec ||
	    (a->tv_sec == b->tv_sec && a->tv_usec < b->tv_usec)) {
		a->tv_sec = 0;
		a->tv_usec = 0;
		return;
	}

	a->tv_sec -= b->tv_sec;
	a->tv_usec -= b->tv_usec;
	while ( a->tv_usec < 0 ) {
		a->tv_sec--;
		a->tv_usec += 1000000;
	}
}

static unsigned int o2hb_elapsed_msecs(struct timeval *start,
				       struct timeval *end)
{
	struct timeval res = *end;

	o2hb_tv_subtract(&res, start);

	return res.tv_sec * 1000 + res.tv_usec / 1000;
}

/*
 * we ride the region ref that the region dir holds.  before the region
 * dir is removed and drops it ref it will wait to tear down this
 * thread.
 */
static int o2hb_thread(void *data)
{
	int i, ret;
	struct o2hb_region *reg = data;
	struct o2hb_bio_wait_ctxt write_wc;
	struct timeval before_hb, after_hb;
	unsigned int elapsed_msec;

	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread running\n");

	set_user_nice(current, -20);

	while (!kthread_should_stop() && !reg->hr_unclean_stop) {
		/* We track the time spent inside
		 * o2hb_do_disk_heartbeat so that we avoid more then
		 * hr_timeout_ms between disk writes. On busy systems
		 * this should result in a heartbeat which is less
		 * likely to time itself out. */
		do_gettimeofday(&before_hb);

863
864
865
866
		i = 0;
		do {
			ret = o2hb_do_disk_heartbeat(reg);
		} while (ret && ++i < 2);
867
868
869
870
871

		do_gettimeofday(&after_hb);
		elapsed_msec = o2hb_elapsed_msecs(&before_hb, &after_hb);

		mlog(0, "start = %lu.%lu, end = %lu.%lu, msec = %u\n",
872
873
874
		     before_hb.tv_sec, (unsigned long) before_hb.tv_usec,
		     after_hb.tv_sec, (unsigned long) after_hb.tv_usec,
		     elapsed_msec);
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895

		if (elapsed_msec < reg->hr_timeout_ms) {
			/* the kthread api has blocked signals for us so no
			 * need to record the return value. */
			msleep_interruptible(reg->hr_timeout_ms - elapsed_msec);
		}
	}

	o2hb_disarm_write_timeout(reg);

	/* unclean stop is only used in very bad situation */
	for(i = 0; !reg->hr_unclean_stop && i < reg->hr_blocks; i++)
		o2hb_shutdown_slot(&reg->hr_slots[i]);

	/* Explicit down notification - avoid forcing the other nodes
	 * to timeout on this region when we could just as easily
	 * write a clear generation - thus indicating to them that
	 * this node has left this region.
	 *
	 * XXX: Should we skip this on unclean_stop? */
	o2hb_prepare_block(reg, 0);
896
	ret = o2hb_issue_node_write(reg, &write_wc);
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
	if (ret == 0) {
		o2hb_wait_on_io(reg, &write_wc);
	} else {
		mlog_errno(ret);
	}

	mlog(ML_HEARTBEAT|ML_KTHREAD, "hb thread exiting\n");

	return 0;
}

void o2hb_init(void)
{
	int i;

	for (i = 0; i < ARRAY_SIZE(o2hb_callbacks); i++)
		INIT_LIST_HEAD(&o2hb_callbacks[i].list);

	for (i = 0; i < ARRAY_SIZE(o2hb_live_slots); i++)
		INIT_LIST_HEAD(&o2hb_live_slots[i]);

	INIT_LIST_HEAD(&o2hb_node_events);

	memset(o2hb_live_node_bitmap, 0, sizeof(o2hb_live_node_bitmap));
}

/* if we're already in a callback then we're already serialized by the sem */
static void o2hb_fill_node_map_from_callback(unsigned long *map,
					     unsigned bytes)
{
	BUG_ON(bytes < (BITS_TO_LONGS(O2NM_MAX_NODES) * sizeof(unsigned long)));

	memcpy(map, &o2hb_live_node_bitmap, bytes);
}

/*
 * get a map of all nodes that are heartbeating in any regions
 */
void o2hb_fill_node_map(unsigned long *map, unsigned bytes)
{
	/* callers want to serialize this map and callbacks so that they
	 * can trust that they don't miss nodes coming to the party */
	down_read(&o2hb_callback_sem);
	spin_lock(&o2hb_live_lock);
	o2hb_fill_node_map_from_callback(map, bytes);
	spin_unlock(&o2hb_live_lock);
	up_read(&o2hb_callback_sem);
}
EXPORT_SYMBOL_GPL(o2hb_fill_node_map);

/*
 * heartbeat configfs bits.  The heartbeat set is a default set under
 * the cluster set in nodemanager.c.
 */

static struct o2hb_region *to_o2hb_region(struct config_item *item)
{
	return item ? container_of(item, struct o2hb_region, hr_item) : NULL;
}

/* drop_item only drops its ref after killing the thread, nothing should
 * be using the region anymore.  this has to clean up any state that
 * attributes might have built up. */
static void o2hb_region_release(struct config_item *item)
{
	int i;
	struct page *page;
	struct o2hb_region *reg = to_o2hb_region(item);

	if (reg->hr_tmp_block)
		kfree(reg->hr_tmp_block);

	if (reg->hr_slot_data) {
		for (i = 0; i < reg->hr_num_pages; i++) {
			page = reg->hr_slot_data[i];
			if (page)
				__free_page(page);
		}
		kfree(reg->hr_slot_data);
	}

	if (reg->hr_bdev)
		blkdev_put(reg->hr_bdev);

	if (reg->hr_slots)
		kfree(reg->hr_slots);

	spin_lock(&o2hb_live_lock);
	list_del(&reg->hr_all_item);
	spin_unlock(&o2hb_live_lock);

	kfree(reg);
}

static int o2hb_read_block_input(struct o2hb_region *reg,
				 const char *page,
				 size_t count,
				 unsigned long *ret_bytes,
				 unsigned int *ret_bits)
{
	unsigned long bytes;
	char *p = (char *)page;

	bytes = simple_strtoul(p, &p, 0);