Commit 6c52a96e authored by David S. Miller
Browse files

[SPARC64]: Revamp Spitfire error trap handling.



Current uncorrectable error handling was poor enough
that the processor could just loop taking the same
trap over and over again.  Fix things up so that we
at least get a log message and perhaps even some register
state.

In the process, much consolidation became possible,
particularly with the correctable error handler.

Prefix assembler and C function names with "spitfire"
to indicate that these are for Ultra-I/II/IIi/IIe only.

More work is needed to make these routines robust and
featureful to the level of the Ultra-III error handlers.
Signed-off-by: David S. Miller <davem@davemloft.net>
parent bde4e4ee
......@@ -21,6 +21,7 @@
#include <asm/visasm.h>
#include <asm/estate.h>
#include <asm/auxio.h>
#include <asm/sfafsr.h>
#define curptr g6
......@@ -690,9 +691,159 @@ netbsd_syscall:
retl
nop
.globl __do_data_access_exception
.globl __do_data_access_exception_tl1
__do_data_access_exception_tl1:
/* We need to carefully read the error status, ACK
* the errors, prevent recursive traps, and pass the
* information on to C code for logging.
*
* We pass the AFAR in as-is, and we encode the status
* information as described in asm-sparc64/sfafsr.h
*/
.globl __spitfire_access_error
__spitfire_access_error:
/* Register roles set up by the code that follows:
 *   %g1      UDB error bit to look for and clear (UDBE_UE on this
 *            path; UDBE_CE when entered at
 *            __spitfire_cee_trap_continue)
 *   %g4      AFSR, later OR-ed with the trap type, the TL>1 flag
 *            and both UDB error register values (SFSTAT_* shifts)
 *   %g5      AFAR, passed through unmodified
 *   %g2      %pil as read just before PIL is set to 15
 *   %g3/%g7  scratch
 */
/* Disable ESTATE error reporting so that we do not
* take recursive traps and RED state the processor.
*/
stxa %g0, [%g0] ASI_ESTATE_ERROR_EN
membar #Sync
mov UDBE_UE, %g1
ldxa [%g0] ASI_AFSR, %g4 ! Get AFSR
/* __spitfire_cee_trap branches here with AFSR in %g4 and
* UDBE_CE in %g1. It only clears ESTATE_ERR_CE in the
* ESTATE Error Enable register.
*/
__spitfire_cee_trap_continue:
ldxa [%g0] ASI_AFAR, %g5 ! Get AFAR
/* Fold the trap type (low 9 bits of %tt) into the status word. */
rdpr %tt, %g3
and %g3, 0x1ff, %g3 ! Paranoia
sllx %g3, SFSTAT_TRAP_TYPE_SHIFT, %g3
or %g4, %g3, %g4
/* Set the SFSTAT_TL_GT_ONE_SHIFT bit in the status word only when
 * %tl > 1.  The sllx sits in the branch delay slot and so runs on
 * both paths; the "or" that follows is skipped when the branch to
 * 1f is taken (%tl <= 1).
 */
rdpr %tl, %g3
cmp %g3, 1
mov 1, %g3
bleu %xcc, 1f
sllx %g3, SFSTAT_TL_GT_ONE_SHIFT, %g3
or %g4, %g3, %g4
/* Read in the UDB error register state, clearing the
* sticky error bits as-needed. We only clear them if
* the UE bit is set. Likewise, __spitfire_cee_trap
* below will only do so if the CE bit is set.
*
* NOTE: UltraSparc-I/II have high and low UDB error
* registers, corresponding to the two UDB units
* present on those chips. UltraSparc-IIi only
* has a single UDB, called "SDB" in the manual.
* For IIi the upper UDB register always reads
* as zero so for our purposes things will just
* work with the checks below.
*/
/* UDB-High: read at address 0, low 10 bits go into the status word. */
1: ldxa [%g0] ASI_UDBH_ERROR_R, %g3
and %g3, 0x3ff, %g7 ! Paranoia
sllx %g7, SFSTAT_UDBH_SHIFT, %g7
or %g4, %g7, %g4
/* Keep only the bit selected by %g1; if it is set, write it back
 * through ASI_UDB_ERROR_W at the same address (0) used for the read.
 */
andcc %g3, %g1, %g3 ! UDBE_UE or UDBE_CE
be,pn %xcc, 1f
nop
stxa %g3, [%g0] ASI_UDB_ERROR_W
membar #Sync
/* UDB-Low: same sequence, using address 0x18 for both read and write. */
1: mov 0x18, %g3
ldxa [%g3] ASI_UDBL_ERROR_R, %g3
and %g3, 0x3ff, %g7 ! Paranoia
sllx %g7, SFSTAT_UDBL_SHIFT, %g7
or %g4, %g7, %g4
andcc %g3, %g1, %g3 ! UDBE_UE or UDBE_CE
be,pn %xcc, 1f
nop
mov 0x18, %g7
stxa %g3, [%g7] ASI_UDB_ERROR_W
membar #Sync
1: /* Ok, now that we've latched the error state,
* clear the sticky bits in the AFSR.
*/
/* NOTE(review): the value stored is the whole of %g4, which by now
 * also carries the trap type, TL>1 and UDB fields OR-ed in above;
 * presumably the AFSR write ignores those extra bits -- confirm
 * against the processor manual.
 */
stxa %g4, [%g0] ASI_AFSR
membar #Sync
/* Pick the etrap flavour by trap level: etrap_irq when %tl <= 1,
 * etraptl1 otherwise.  The old %pil is captured in %g2 and PIL is
 * set to 15 (in the delay slot, so on both paths) beforehand.
 * Each "rd %pc, %g7" sits in the delay slot of the branch into
 * etrap; presumably etrap uses %g7 to resume at the instruction
 * following that rd -- confirm against etrap's definition.
 */
rdpr %tl, %g2
cmp %g2, 1
rdpr %pil, %g2
bleu,pt %xcc, 1f
wrpr %g0, 15, %pil
ba,pt %xcc, etraptl1
rd %pc, %g7
ba,pt %xcc, 2f
nop
1: ba,pt %xcc, etrap_irq
rd %pc, %g7
/* Call spitfire_access_error(%sp + PTREGS_OFF, %l4, %l5).  %l4/%l5
 * presumably hold the %g4/%g5 values preserved across etrap --
 * confirm.  The add in the call's delay slot sets up %o0.
 */
2: mov %l4, %o1
mov %l5, %o2
call spitfire_access_error
add %sp, PTREGS_OFF, %o0
/* Leave through rtrap; %l6 is cleared in the branch delay slot. */
ba,pt %xcc, rtrap
clr %l6
/* This is the trap handler entry point for ECC correctable
* errors. They are corrected, but we listen for the trap
* so that the event can be logged.
*
* Disrupting errors are either:
* 1) single-bit ECC errors during UDB reads to system
* memory
* 2) data parity errors during write-back events
*
* As far as I can make out from the manual, the CEE trap
* is only for correctable errors during memory read
* accesses by the front-end of the processor.
*
* The code below is only for trap level 1 CEE events,
* as it is the only situation where we can safely record
* and log. For trap level >1 we just clear the CE bit
* in the AFSR and return.
*
* This is just like __spitfire_access_error above, but it
* specifically handles correctable errors. If an
* uncorrectable error is indicated in the AFSR we
* will branch directly above to __spitfire_access_error
* to handle it instead. Uncorrectable therefore takes
* priority over correctable, and the error logging
* C code will notice this case by inspecting the
* trap type.
*/
.globl __spitfire_cee_trap
__spitfire_cee_trap:
/* %g4 = AFSR; it is not written again before the branch to
 * __spitfire_cee_trap_continue at the bottom.
 */
ldxa [%g0] ASI_AFSR, %g4 ! Get AFSR
/* Test the single AFSR bit at position SFAFSR_UE_SHIFT; %g3 is
 * only a scratch mask and the andcc result is discarded (%g0).
 */
mov 1, %g3
sllx %g3, SFAFSR_UE_SHIFT, %g3
andcc %g4, %g3, %g0 ! Check for UE
/* UE set: divert to __spitfire_access_error, which reloads the
 * AFSR into %g4 and sets %g1 itself.
 */
bne,pn %xcc, __spitfire_access_error
nop
/* Ok, in this case we only have a correctable error.
* Indicate we only wish to capture that state in register
* %g1, and we only disable CE error reporting unlike UE
* handling which disables all errors.
*/
/* Read-modify-write of the ESTATE Error Enable register: only the
 * ESTATE_ERR_CE bit is cleared, the other bits are written back
 * as read.
 */
ldxa [%g0] ASI_ESTATE_ERROR_EN, %g3
andn %g3, ESTATE_ERR_CE, %g3
stxa %g3, [%g0] ASI_ESTATE_ERROR_EN
membar #Sync
/* Preserve AFSR in %g4, indicate UDB state to capture in %g1 */
/* The mov sits in the branch delay slot, so %g1 = UDBE_CE is in
 * place when execution reaches __spitfire_cee_trap_continue.
 */
ba,pt %xcc, __spitfire_cee_trap_continue
mov UDBE_CE, %g1
.globl __spitfire_data_access_exception
.globl __spitfire_data_access_exception_tl1
__spitfire_data_access_exception_tl1:
rdpr %pstate, %g4
wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate
mov TLB_SFSR, %g3
......@@ -714,12 +865,12 @@ __do_data_access_exception_tl1:
109: or %g7, %lo(109b), %g7
mov %l4, %o1
mov %l5, %o2
call data_access_exception_tl1
call spitfire_data_access_exception_tl1
add %sp, PTREGS_OFF, %o0
ba,pt %xcc, rtrap
clr %l6
__do_data_access_exception:
__spitfire_data_access_exception:
rdpr %pstate, %g4
wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate
mov TLB_SFSR, %g3
......@@ -733,14 +884,14 @@ __do_data_access_exception:
109: or %g7, %lo(109b), %g7
mov %l4, %o1
mov %l5, %o2
call data_access_exception
call spitfire_data_access_exception
add %sp, PTREGS_OFF, %o0
ba,pt %xcc, rtrap
clr %l6
.globl __do_instruction_access_exception
.globl __do_instruction_access_exception_tl1
__do_instruction_access_exception_tl1:
.globl __spitfire_insn_access_exception
.globl __spitfire_insn_access_exception_tl1
__spitfire_insn_access_exception_tl1:
rdpr %pstate, %g4
wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate
mov TLB_SFSR, %g3
......@@ -753,12 +904,12 @@ __do_instruction_access_exception_tl1:
109: or %g7, %lo(109b), %g7
mov %l4, %o1
mov %l5, %o2
call instruction_access_exception_tl1
call spitfire_insn_access_exception_tl1
add %sp, PTREGS_OFF, %o0
ba,pt %xcc, rtrap
clr %l6
__do_instruction_access_exception:
__spitfire_insn_access_exception:
rdpr %pstate, %g4
wrpr %g4, PSTATE_MG|PSTATE_AG, %pstate
mov TLB_SFSR, %g3
......@@ -771,102 +922,11 @@ __do_instruction_access_exception:
109: or %g7, %lo(109b), %g7
mov %l4, %o1
mov %l5, %o2
call instruction_access_exception
call spitfire_insn_access_exception
add %sp, PTREGS_OFF, %o0
ba,pt %xcc, rtrap
clr %l6
/* This is the trap handler entry point for ECC correctable
* errors. They are corrected, but we listen for the trap
* so that the event can be logged.
*
* Disrupting errors are either:
* 1) single-bit ECC errors during UDB reads to system
* memory
* 2) data parity errors during write-back events
*
* As far as I can make out from the manual, the CEE trap
* is only for correctable errors during memory read
* accesses by the front-end of the processor.
*
* The code below is only for trap level 1 CEE events,
* as it is the only situation where we can safely record
* and log. For trap level >1 we just clear the CE bit
* in the AFSR and return.
*/
/* Our trap handling infrastructure allows us to preserve
* two 64-bit values during etrap for arguments to
* subsequent C code. Therefore we encode the information
* as follows:
*
* value 1) Full 64-bits of AFAR
* value 2) Low 33-bits of AFSR, then bits 33-->42
* are UDBL error status and bits 43-->52
* are UDBH error status
*/
/* cee_trap: correctable-ECC trap entry that packs AFSR and both UDB
 * error registers into one word and hands it, with the AFAR, to
 * cee_log().  NOTE(review): this appears to be the pre-change
 * handler that __spitfire_cee_trap supersedes -- confirm against
 * the diff markers, which are not visible here.
 *
 * NOTE(review): this routine reads "UDB-Low" at address 0 and
 * "UDB-High" at address 0x18, whereas __spitfire_access_error reads
 * ASI_UDBH_ERROR_R at address 0 and ASI_UDBL_ERROR_R at 0x18.  The
 * two disagree on which address is which -- confirm against the
 * processor manual.
 */
.align 64
.globl cee_trap
cee_trap:
ldxa [%g0] ASI_AFSR, %g1 ! Read AFSR
ldxa [%g0] ASI_AFAR, %g2 ! Read AFAR
/* Shift left then right by 31: keeps the low 33 bits of the AFSR. */
sllx %g1, 31, %g1 ! Clear reserved bits
srlx %g1, 31, %g1 ! in AFSR
/* NOTE: UltraSparc-I/II have high and low UDB error
* registers, corresponding to the two UDB units
* present on those chips. UltraSparc-IIi only
* has a single UDB, called "SDB" in the manual.
* For IIi the upper UDB register always reads
* as zero so for our purposes things will just
* work with the checks below.
*/
/* The andcc sets the condition codes tested by the be further
 * down; the shifts and the or in between are the non-cc forms and
 * leave them untouched.  %g4 keeps the isolated CE bit (bit 8) for
 * the write-back that clears it.
 */
ldxa [%g0] ASI_UDBL_ERROR_R, %g3 ! Read UDB-Low error status
andcc %g3, (1 << 8), %g4 ! Check CE bit
sllx %g3, (64 - 10), %g3 ! Clear reserved bits
srlx %g3, (64 - 10), %g3 ! in UDB-Low error status
sllx %g3, (33 + 0), %g3 ! Shift up to encoding area
or %g1, %g3, %g1 ! Or it in
be,pn %xcc, 1f ! Branch if CE bit was clear
nop
stxa %g4, [%g0] ASI_UDB_ERROR_W ! Clear CE sticky bit in UDBL
membar #Sync ! Synchronize ASI stores
/* Same pattern for the register at address 0x18, packed 10 bits
 * higher (starting at bit 43).
 */
1: mov 0x18, %g5 ! Addr of UDB-High error status
ldxa [%g5] ASI_UDBH_ERROR_R, %g3 ! Read it
andcc %g3, (1 << 8), %g4 ! Check CE bit
sllx %g3, (64 - 10), %g3 ! Clear reserved bits
srlx %g3, (64 - 10), %g3 ! in UDB-High error status
sllx %g3, (33 + 10), %g3 ! Shift up to encoding area
or %g1, %g3, %g1 ! Or it in
be,pn %xcc, 1f ! Branch if CE bit was clear
nop
/* Second nop is not a delay slot; it only runs on the fall-through
 * (CE bit set) path.
 */
nop
stxa %g4, [%g5] ASI_UDB_ERROR_W ! Clear CE sticky bit in UDBH
membar #Sync ! Synchronize ASI stores
1: mov 1, %g5 ! AFSR CE bit is
sllx %g5, 20, %g5 ! bit 20
stxa %g5, [%g0] ASI_AFSR ! Clear CE sticky bit in AFSR
membar #Sync ! Synchronize ASI stores
/* Keep AFAR bits 40..4: the shift pair drops everything above
 * bit 40 and the andn clears the low four bits.
 */
sllx %g2, (64 - 41), %g2 ! Clear reserved bits
srlx %g2, (64 - 41), %g2 ! in latched AFAR
andn %g2, 0x0f, %g2 ! Finish resv bit clearing
mov %g1, %g4 ! Move AFSR+UDB* into save reg
mov %g2, %g5 ! Move AFAR into save reg
/* Save the current PIL in %g2, set PIL to 15, then enter
 * etrap_irq with %g7 = address of the rd in the delay slot
 * (presumably etrap's return anchor -- confirm).
 */
rdpr %pil, %g2
wrpr %g0, 15, %pil
ba,pt %xcc, etrap_irq
rd %pc, %g7
/* cee_log(%l4, %l5, %sp + PTREGS_OFF): status word, AFAR, regs.
 * The add in the call's delay slot supplies the third argument.
 */
mov %l4, %o0
mov %l5, %o1
call cee_log
add %sp, PTREGS_OFF, %o2
/* Annulled branch-always: no delay-slot instruction is executed. */
ba,a,pt %xcc, rtrap_irq
/* Capture I/D/E-cache state into per-cpu error scoreboard.
*
* %g1: (TL>=0) ? 1 : 0
......
......@@ -33,6 +33,7 @@
#include <asm/dcu.h>
#include <asm/estate.h>
#include <asm/chafsr.h>
#include <asm/sfafsr.h>
#include <asm/psrcompat.h>
#include <asm/processor.h>
#include <asm/timer.h>
......@@ -143,8 +144,7 @@ void do_BUG(const char *file, int line)
}
#endif
void instruction_access_exception(struct pt_regs *regs,
unsigned long sfsr, unsigned long sfar)
void spitfire_insn_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
siginfo_t info;
......@@ -153,8 +153,8 @@ void instruction_access_exception(struct pt_regs *regs,
return;
if (regs->tstate & TSTATE_PRIV) {
printk("instruction_access_exception: SFSR[%016lx] SFAR[%016lx], going.\n",
sfsr, sfar);
printk("spitfire_insn_access_exception: SFSR[%016lx] "
"SFAR[%016lx], going.\n", sfsr, sfar);
die_if_kernel("Iax", regs);
}
if (test_thread_flag(TIF_32BIT)) {
......@@ -169,19 +169,17 @@ void instruction_access_exception(struct pt_regs *regs,
force_sig_info(SIGSEGV, &info, current);
}
/* Instruction access exception taken at trap level > 1.
 *
 * This span shows both the old name (instruction_access_exception*)
 * and the new spitfire_insn_access_exception* name for the signature
 * and for the final call; the body is otherwise shared.
 *
 * @regs: trap-time register frame
 * @sfsr, @sfar: passed straight through to the non-TL1 handler
 */
void instruction_access_exception_tl1(struct pt_regs *regs,
unsigned long sfsr, unsigned long sfar)
void spitfire_insn_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
/* Let die-chain listeners see the event first; stop here if one
 * of them answers NOTIFY_STOP.
 */
if (notify_die(DIE_TRAP_TL1, "instruction access exception tl1", regs,
0, 0x8, SIGTRAP) == NOTIFY_STOP)
return;
/* The TL1 trap log is taken from the memory immediately after *regs. */
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
/* Then handle it exactly like the TL<=1 case. */
instruction_access_exception(regs, sfsr, sfar);
spitfire_insn_access_exception(regs, sfsr, sfar);
}
void data_access_exception(struct pt_regs *regs,
unsigned long sfsr, unsigned long sfar)
void spitfire_data_access_exception(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
siginfo_t info;
......@@ -207,8 +205,8 @@ void data_access_exception(struct pt_regs *regs,
return;
}
/* Shit... */
printk("data_access_exception: SFSR[%016lx] SFAR[%016lx], going.\n",
sfsr, sfar);
printk("spitfire_data_access_exception: SFSR[%016lx] "
"SFAR[%016lx], going.\n", sfsr, sfar);
die_if_kernel("Dax", regs);
}
......@@ -220,15 +218,14 @@ void data_access_exception(struct pt_regs *regs,
force_sig_info(SIGSEGV, &info, current);
}
/* Data access exception taken at trap level > 1.
 *
 * This span shows both the old name (data_access_exception*) and the
 * new spitfire_data_access_exception* name for the signature and for
 * the final call; the body is otherwise shared.
 *
 * @regs: trap-time register frame
 * @sfsr, @sfar: passed straight through to the non-TL1 handler
 */
void data_access_exception_tl1(struct pt_regs *regs,
unsigned long sfsr, unsigned long sfar)
void spitfire_data_access_exception_tl1(struct pt_regs *regs, unsigned long sfsr, unsigned long sfar)
{
/* Let die-chain listeners see the event first; stop here if one
 * of them answers NOTIFY_STOP.
 */
if (notify_die(DIE_TRAP_TL1, "data access exception tl1", regs,
0, 0x30, SIGTRAP) == NOTIFY_STOP)
return;
/* The TL1 trap log is taken from the memory immediately after *regs. */
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
/* Then handle it exactly like the TL<=1 case. */
data_access_exception(regs, sfsr, sfar);
spitfire_data_access_exception(regs, sfsr, sfar);
}
#ifdef CONFIG_PCI
......@@ -264,54 +261,13 @@ static void spitfire_clean_and_reenable_l1_caches(void)
: "memory");
}
void do_iae(struct pt_regs *regs)
static void spitfire_enable_estate_errors(void)
{
siginfo_t info;
spitfire_clean_and_reenable_l1_caches();
if (notify_die(DIE_TRAP, "instruction access exception", regs,
0, 0x8, SIGTRAP) == NOTIFY_STOP)
return;
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_OBJERR;
info.si_addr = (void *)0;
info.si_trapno = 0;
force_sig_info(SIGBUS, &info, current);
}
void do_dae(struct pt_regs *regs)
{
siginfo_t info;
#ifdef CONFIG_PCI
if (pci_poke_in_progress && pci_poke_cpu == smp_processor_id()) {
spitfire_clean_and_reenable_l1_caches();
pci_poke_faulted = 1;
/* Why the fuck did they have to change this? */
if (tlb_type == cheetah || tlb_type == cheetah_plus)
regs->tpc += 4;
regs->tnpc = regs->tpc + 4;
return;
}
#endif
spitfire_clean_and_reenable_l1_caches();
if (notify_die(DIE_TRAP, "data access exception", regs,
0, 0x30, SIGTRAP) == NOTIFY_STOP)
return;
info.si_signo = SIGBUS;
info.si_errno = 0;
info.si_code = BUS_OBJERR;
info.si_addr = (void *)0;
info.si_trapno = 0;
force_sig_info(SIGBUS, &info, current);
__asm__ __volatile__("stxa %0, [%%g0] %1\n\t"
"membar #Sync"
: /* no outputs */
: "r" (ESTATE_ERR_ALL),
"i" (ASI_ESTATE_ERROR_EN));
}
static char ecc_syndrome_table[] = {
......@@ -349,65 +305,15 @@ static char ecc_syndrome_table[] = {
0x0b, 0x48, 0x48, 0x4b, 0x48, 0x4b, 0x4b, 0x4a
};
/* cee_trap in entry.S encodes AFSR/UDBH/UDBL error status
* in the following format. The AFAR is left as is, with
* reserved bits cleared, and is a raw 40-bit physical
* address.
*/
#define CE_STATUS_UDBH_UE (1UL << (43 + 9))
#define CE_STATUS_UDBH_CE (1UL << (43 + 8))
#define CE_STATUS_UDBH_ESYNDR (0xffUL << 43)
#define CE_STATUS_UDBH_SHIFT 43
#define CE_STATUS_UDBL_UE (1UL << (33 + 9))
#define CE_STATUS_UDBL_CE (1UL << (33 + 8))
#define CE_STATUS_UDBL_ESYNDR (0xffUL << 33)
#define CE_STATUS_UDBL_SHIFT 33
#define CE_STATUS_AFSR_MASK (0x1ffffffffUL)
#define CE_STATUS_AFSR_ME (1UL << 32)
#define CE_STATUS_AFSR_PRIV (1UL << 31)
#define CE_STATUS_AFSR_ISAP (1UL << 30)
#define CE_STATUS_AFSR_ETP (1UL << 29)
#define CE_STATUS_AFSR_IVUE (1UL << 28)
#define CE_STATUS_AFSR_TO (1UL << 27)
#define CE_STATUS_AFSR_BERR (1UL << 26)
#define CE_STATUS_AFSR_LDP (1UL << 25)
#define CE_STATUS_AFSR_CP (1UL << 24)
#define CE_STATUS_AFSR_WP (1UL << 23)
#define CE_STATUS_AFSR_EDP (1UL << 22)
#define CE_STATUS_AFSR_UE (1UL << 21)
#define CE_STATUS_AFSR_CE (1UL << 20)
#define CE_STATUS_AFSR_ETS (0xfUL << 16)
#define CE_STATUS_AFSR_ETS_SHIFT 16
#define CE_STATUS_AFSR_PSYND (0xffffUL << 0)
#define CE_STATUS_AFSR_PSYND_SHIFT 0
/* Layout of Ecache TAG Parity Syndrome of AFSR */
#define AFSR_ETSYNDROME_7_0 0x1UL /* E$-tag bus bits <7:0> */
#define AFSR_ETSYNDROME_15_8 0x2UL /* E$-tag bus bits <15:8> */
#define AFSR_ETSYNDROME_21_16 0x4UL /* E$-tag bus bits <21:16> */
#define AFSR_ETSYNDROME_24_22 0x8UL /* E$-tag bus bits <24:22> */
static char *syndrome_unknown = "<Unknown>";
asmlinkage void cee_log(unsigned long ce_status,
unsigned long afar,
struct pt_regs *regs)
static void spitfire_log_udb_syndrome(unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long bit)
{
char memmod_str[64];
char *p;
unsigned short scode, udb_reg;
unsigned short scode;
char memmod_str[64], *p;
printk(KERN_WARNING "CPU[%d]: Correctable ECC Error "
"AFSR[%lx] AFAR[%016lx] UDBL[%lx] UDBH[%lx]\n",
smp_processor_id(),
(ce_status & CE_STATUS_AFSR_MASK),
afar,
((ce_status >> CE_STATUS_UDBL_SHIFT) & 0x3ffUL),
((ce_status >> CE_STATUS_UDBH_SHIFT) & 0x3ffUL));
udb_reg = ((ce_status >> CE_STATUS_UDBL_SHIFT) & 0x3ffUL);
if (udb_reg & (1 << 8)) {
scode = ecc_syndrome_table[udb_reg & 0xff];
if (udbl & bit) {
scode = ecc_syndrome_table[udbl & 0xff];
if (prom_getunumber(scode, afar,
memmod_str, sizeof(memmod_str)) == -1)
p = syndrome_unknown;
......@@ -418,9 +324,8 @@ asmlinkage void cee_log(unsigned long ce_status,
smp_processor_id(), scode, p);
}
udb_reg = ((ce_status >> CE_STATUS_UDBH_SHIFT) & 0x3ffUL);
if (udb_reg & (1 << 8)) {
scode = ecc_syndrome_table[udb_reg & 0xff];
if (udbh & bit) {
scode = ecc_syndrome_table[udbh & 0xff];
if (prom_getunumber(scode, afar,
memmod_str, sizeof(memmod_str)) == -1)
p = syndrome_unknown;
......@@ -430,6 +335,127 @@ asmlinkage void cee_log(unsigned long ce_status,
"Memory Module \"%s\"\n",
smp_processor_id(), scode, p);
}
}
/* Log a correctable ECC error and re-arm error reporting.
 *
 * @afsr, @afar:  AFSR and AFAR values to print
 * @udbh, @udbl:  UDB-High / UDB-Low error register values
 * @tl1:          printed in the "TL>1" field of the message
 * @regs:         trap-time register frame, handed to notify_die()
 *
 * Prints one summary line, decodes the UDB syndrome for any UDB that
 * has UDBE_CE set, notifies the die chain, and finally writes the
 * ESTATE Error Enable register via spitfire_enable_estate_errors().
 */
static void spitfire_cee_log(unsigned long afsr, unsigned long afar,
			     unsigned long udbh, unsigned long udbl,
			     int tl1, struct pt_regs *regs)
{
	const int cpu = smp_processor_id();

	printk(KERN_WARNING "CPU[%d]: Correctable ECC Error "
	       "AFSR[%lx] AFAR[%016lx] UDBL[%lx] UDBH[%lx] TL>1[%d]\n",
	       cpu, afsr, afar, udbl, udbh, tl1);

	spitfire_log_udb_syndrome(afar, udbh, udbl, UDBE_CE);

	/* Logging above is unconditional; the notifier result is
	 * deliberately not consulted, so a listener cannot suppress
	 * the rest of this function.
	 */
	notify_die(DIE_TRAP, "Correctable ECC Error", regs,
		   0, TRAP_TYPE_CEE, SIGTRAP);

	/* I/D caches are not disabled by the Correctable ECC Error
	 * trap, so restoring the ESTATE Error Enable register is the
	 * only recovery step needed.
	 */
	spitfire_enable_estate_errors();
}
static void spitfire_ue_log(unsigned long afsr, unsigned long afar, unsigned long udbh, unsigned long udbl, unsigned long tt, int tl1, struct pt_regs *regs)
{
siginfo_t info;
printk(KERN_WARNING "CPU[%d]: Uncorrectable Error AFSR[%lx] "
"AFAR[%lx] UDBL[%lx] UDBH[%ld] TT[%lx] TL>1[%d]\n",
smp_processor_id(), afsr, afar, udbl, udbh, tt, tl1);
/* XXX add more human friendly logging of the error status
* XXX as is implemented for cheetah
*/
spitfire_log_udb_syndrome(afar, udbh, udbl, UDBE_UE);
/* We always log it, even if someone is listening for this
* trap.
*/
notify_die(DIE_TRAP, "Uncorrectable Error", regs,
0, tt, SIGTRAP);
if (regs->tstate & TSTATE_PRIV) {
if (tl1)
dump_tl1_traplog((struct tl1_traplog *)(regs + 1));
die_if_kernel("UE", regs);
}
/* XXX need more intelligent processing here, such as is implemented
* XXX for cheetah errors, in fact if the E-cache still holds the
* XXX line with bad parity this will loop
*/
spitfire_clean_and_reenable_l1_caches();
spitfire_enable_estate_errors();
if (test_thread_flag(TIF_32BIT)) {
regs->tpc &= 0xffffffff;
regs->tnpc &= 0xffffffff;
}