/*********************************************************************
 *                
 * Copyright (C) 2006-2008, 2010,  Karlsruhe University
 *                
 * File path:     platform/pc99/perfmon.h
 * Description:   Performance monitoring counter macros for IA32/AMD64 CPUs.
 *                
 * @LICENSE@
 *                
 * $Id: perfmon.h,v 1.1 2006/09/26 10:41:15 stoess Exp $
 *                
 ********************************************************************/

#ifndef __PLATFORM__PC99__PERFMON_H__
#define __PLATFORM__PC99__PERFMON_H__

#include INC_ARCH(cpu.h)

/*********************************************************************
 * Pentium 3 processors
 *********************************************************************/

#if defined(CONFIG_CPU_X86_I686) 

#define X86_MSR_PMC_EVTSEL0		       0x186      /* Performance EVT0 */
#define X86_MSR_PMC_EVTSEL1		       0x187      /* Performance EVT1 */

#define X86_MSR_PMC_CTR0		       0xc1      /* Performance CTR0 */
#define X86_MSR_PMC_CTR1		       0xc2      /* Performance CTR1 */

/*********************************************************************
 * Athlon and Opteron processors
 *********************************************************************/

#elif defined(CONFIG_CPU_X86_K8) 

#define X86_MSR_PMC_EVTSEL0		       0xC0010000      /* Performance EVT0 */
#define X86_MSR_PMC_EVTSEL1		       0xC0010001      /* Performance EVT1 */
#define X86_MSR_PMC_EVTSEL2		       0xC0010002      /* Performance EVT2 */
#define X86_MSR_PMC_EVTSEL3		       0xC0010003      /* Performance EVT3 */

#define X86_MSR_PMC_CTR0		       0xC0010004      /* Performance CTR0 */
#define X86_MSR_PMC_CTR1		       0xC0010005      /* Performance CTR1 */
#define X86_MSR_PMC_CTR2		       0xC0010006      /* Performance CTR2 */
#define X86_MSR_PMC_CTR3		       0xC0010007      /* Performance CTR3 */

/*********************************************************************
 * P4, Pentium D and Xeon processors
 *********************************************************************/

#elif defined(CONFIG_CPU_X86_P4)

#define X86_MSR_PMC_BASE			0x300
#define X86_MSR_PMC_CTR_NO(addr)		((addr) - X86_MSR_PMC_BASE)

#define X86_MSR_PMC_BPU_COUNTER(x)		(0x300 + (x))
#define X86_MSR_PMC_MS_COUNTER(x)		(0x304 + (x))
#define X86_MSR_PMC_FLAME_COUNTER(x)		(0x308 + (x))
#define X86_MSR_PMC_IQ_COUNTER(x)		(0x30C + (x))

#define X86_MSR_PMC_BPU_CCCR(x)			(0x360 + (x))
#define X86_MSR_PMC_MS_CCCR(x)			(0x364 + (x))
#define X86_MSR_PMC_FLAME_CCCR(x)		(0x368 + (x))
#define X86_MSR_PMC_IQ_CCCR(x)			(0x36C + (x))

#define X86_MSR_PMC_BSU_ESCR(x)			(0x3A0 + (x))
#define X86_MSR_PMC_FSB_ESCR(x)			(0x3A2 + (x))
#define X86_MSR_PMC_FIRM_ESCR(x)		(0x3A4 + (x))
#define X86_MSR_PMC_FLAME_ESCR(x)		(0x3A6 + (x))
#define X86_MSR_PMC_DAC_ESCR(x)			(0x3A8 + (x))
#define X86_MSR_PMC_MOB_ESCR(x)			(0x3AA + (x))
#define X86_MSR_PMC_PMH_ESCR(x)			(0x3AC + (x))
#define X86_MSR_PMC_SAAT_ESCR(x)		(0x3AE + (x))
#define X86_MSR_PMC_U2L_ESCR(x)			(0x3B0 + (x))
#define X86_MSR_PMC_BPU_ESCR(x)			(0x3B2 + (x))
#define X86_MSR_PMC_IS_ESCR(x)			(0x3B4 + (x))
#define X86_MSR_PMC_ITLB_ESCR(x)		(0x3B6 + (x))
#define X86_MSR_PMC_IQ_ESCR(x)			(0x3BA + (x))
#define X86_MSR_PMC_RAT_ESCR(x)			(0x3BC + (x))
#define X86_MSR_PMC_SSU_ESCR(x)			(0x3BE + (x))
#define X86_MSR_PMC_MS_ESCR(x)			(0x3C0 + (x))
#define X86_MSR_PMC_TBPU_ESCR(x)		(0x3C2 + (x))
#define X86_MSR_PMC_TC_ESCR(x)			(0x3C4 + (x))
#define X86_MSR_PMC_IX_ESCR(x)			(0x3C8 + (x))
#define X86_MSR_PMC_ALF_ESCR(x)			(0x3CA + (x))

#define X86_MSR_PMC_CRU_ESCR0			0x3B8
#define X86_MSR_PMC_CRU_ESCR1			0x3B9
#define X86_MSR_PMC_CRU_ESCR2			0x3CC
#define X86_MSR_PMC_CRU_ESCR3			0x3CD
#define X86_MSR_PMC_CRU_ESCR4			0x3E0
#define X86_MSR_PMC_CRU_ESCR5			0x3E1
#define X86_MSR_PMC_TC_PRECISE_EVENT		0x3F0
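
/* PEBS control MSRs, referenced by setup_perfmon_cpu() below; addresses
 * per the Intel SDM (IA32_PEBS_ENABLE = 0x3F1, MSR_PEBS_MATRIX_VERT =
 * 0x3F2), guarded in case an architecture header already provides them. */
#ifndef X86_MSR_PEBS_ENABLE
#define X86_MSR_PEBS_ENABLE			0x3F1
#endif
#ifndef X86_MSR_PEBS_MATRIX_VERT
#define X86_MSR_PEBS_MATRIX_VERT		0x3F2
#endif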

#define X86_MSR_PMC_BPU_CTR_BSU_ESCR		7
#define X86_MSR_PMC_BPU_CTR_FSB_ESCR		6
#define X86_MSR_PMC_BPU_CTR_MOB_ESCR		2
#define X86_MSR_PMC_BPU_CTR_PMH_ESCR		4
#define X86_MSR_PMC_BPU_CTR_BPU_ESCR		0
#define X86_MSR_PMC_BPU_CTR_IS_ESCR		1
#define X86_MSR_PMC_BPU_CTR_ITLB_ESCR		3
#define X86_MSR_PMC_BPU_CTR_IX_ESCR		5

#define X86_MSR_PMC_MS_CTR_MS_ESCR		0
#define X86_MSR_PMC_MS_CTR_TBPU_ESCR		2
#define X86_MSR_PMC_MS_CTR_TC_ESCR		1

#define X86_MSR_PMC_FLAME_CTR_FIRM_ESCR		1
#define X86_MSR_PMC_FLAME_CTR_FLAME_ESCR	0
#define X86_MSR_PMC_FLAME_CTR_DAC_ESCR		5
#define X86_MSR_PMC_FLAME_CTR_SAAT_ESCR		2
#define X86_MSR_PMC_FLAME_CTR_U2L_ESCR		3

#define X86_MSR_PMC_IQ_CTR_CRU_ESCR01	4
#define X86_MSR_PMC_IQ_CTR_CRU_ESCR23	5
#define X86_MSR_PMC_IQ_CTR_CRU_ESCR45	6
#define X86_MSR_PMC_IQ_CTR_IQ_ESCR01	0
#define X86_MSR_PMC_IQ_CTR_RAT_ESCR01	2
#define X86_MSR_PMC_IQ_CTR_SSU_ESCR01	3
#define X86_MSR_PMC_IQ_CTR_ALF_ESCR01	1
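
/* Convenience sketch (a hypothetical helper, not a documented macro):
 * composes a CCCR enable value from one of the ESCR select constants
 * above, mirroring the (1 << 12) | (sel << 13) | (3 << 16) pattern
 * used in setup_perfmon_cpu() below. */
#define X86_MSR_PMC_CCCR_ENABLE(escr_sel)	\
	((1 << 12) | ((escr_sel) << 13) | (3 << 16))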

#endif /* CONFIG_CPU_X86_I686 / CONFIG_CPU_X86_K8 / CONFIG_CPU_X86_P4 */
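
/* Minimal readback sketch (a hypothetical helper, assuming u32_t/u64_t
 * from the kernel type headers): RDPMC reads the counter whose index is
 * in ECX without an MSR access; indices are 0/1 on P6, 0..3 on K8 and
 * 0..17 on P4. */
INLINE u64_t perfmon_rdpmc(word_t ctr)
{
    u32_t lo, hi;
    __asm__ __volatile__("rdpmc" : "=a"(lo), "=d"(hi) : "c"(ctr));
    return ((u64_t)hi << 32) | lo;
}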


INLINE void setup_perfmon_cpu(word_t cpuid)
{

#if defined(CONFIG_CPU_X86_I686) || defined(CONFIG_CPU_X86_K8)

    /* disable PerfEvents */
    x86_wrmsr(X86_MSR_PMC_EVTSEL0, 0);
    x86_wrmsr(X86_MSR_PMC_EVTSEL1, 0);
 
    /* clear PMCs */
    x86_wrmsr(X86_MSR_PMC_CTR0, 0);
    x86_wrmsr(X86_MSR_PMC_CTR1, 0);
    
     /* init PMCs: bit 22 = enable, bit 16 = user mode, bit 17 = kernel
      * mode, low byte = event 0xC0 (instructions retired) */
     x86_wrmsr(X86_MSR_PMC_EVTSEL0, 0x4100C0);  // ENABLE + USER + INST_RETIRED
     x86_wrmsr(X86_MSR_PMC_EVTSEL1, 0x4200C0);  // ENABLE + KERNEL + INST_RETIRED


#elif defined(CONFIG_CPU_X86_P4)
#if defined(CONFIG_TBUF_PERFMON_ENERGY) || defined(CONFIG_X_EVT_LOGGING)
    u64_t val;

    // reset all 18 CCCRs (reserved bits 16/17 set, enable bit clear)
    for (word_t addr=X86_MSR_PMC_BPU_CCCR(0); addr <= X86_MSR_PMC_IQ_CCCR(5); ++addr)
	x86_wrmsr(addr, 0x30000);

    // clear all 18 performance counters
    for (word_t addr=X86_MSR_PMC_BPU_COUNTER(0); addr <= X86_MSR_PMC_IQ_COUNTER(5); ++addr)
	x86_wrmsr(addr, 0);

    // Configure ESCRs

    val = ((u64_t)0x0 << 32) | 0xFC00;
    x86_wrmsr(X86_MSR_PMC_TC_PRECISE_EVENT, val);

    // Enable Precise Event Based Sampling (accurate & low sampling overhead)
    val = ((u64_t)0x0 << 32) | 0x1000001;
    x86_wrmsr(X86_MSR_PEBS_ENABLE, val);

    // PEBS replay tagging also needs the matching PEBS_MATRIX_VERT bit
    val = ((u64_t)0x0 << 32) | 0x1;
    x86_wrmsr(X86_MSR_PEBS_MATRIX_VERT, val);

    // Count unhalted cycles
    val = ((u64_t)0x0 << 32) | 0x2600020C;
    x86_wrmsr(X86_MSR_PMC_FSB_ESCR(0), val);

    // Count load uops that are replayed due to unaligned addresses
    // and/or partial data in the Memory Order Buffer (MOB)
    val = ((u64_t)0x0 << 32) | 0x600740C;
    x86_wrmsr(X86_MSR_PMC_MOB_ESCR(0), val);

    // Count op queue writes
    val = ((u64_t)0x0 << 32) | 0x12000E0C;
    x86_wrmsr(X86_MSR_PMC_MS_ESCR(0), val);

    // Count retired branches
    val = ((u64_t)0x0 << 32) | 0x8003C0C;
    x86_wrmsr(X86_MSR_PMC_TBPU_ESCR(0), val);

    // Count x87_FP_uop 
    val = ((u64_t)0x0 << 32) | 0x900000C;
    x86_wrmsr(X86_MSR_PMC_FIRM_ESCR(0), val);

    // Count mispredicted
    val = ((u64_t)0x0 << 32) | 0x600020C;
    x86_wrmsr(X86_MSR_PMC_CRU_ESCR0, val);

    // Count memory retired
    val = ((u64_t)0x0 << 32) | 0x1000020C;
    x86_wrmsr(X86_MSR_PMC_CRU_ESCR2, val);

    // Count load miss level 1 data cache
    val = ((u64_t)0x0 << 32) | 0x1200020C;
    x86_wrmsr(X86_MSR_PMC_CRU_ESCR3, val);

    // Count uop type
    val = ((u64_t)0x0 << 32) | 0x4000C0C;
    x86_wrmsr(X86_MSR_PMC_RAT_ESCR(0), val);

    // Configure CCCRs

    // Store unhalted cycles
    val = ((u64_t)0x0 << 32) | 0x3D000;
    x86_wrmsr(X86_MSR_PMC_BPU_CCCR(0), val);

    // Store MOB load replay
    val = ((u64_t)0x0 << 32) | 0x35000;
    x86_wrmsr(X86_MSR_PMC_BPU_CCCR(1), val);

    // Store op queue writes
    val = ((u64_t)0x0 << 32) | 0x31000;
    x86_wrmsr(X86_MSR_PMC_MS_CCCR(0), val);

    // Store retired branches
    val = ((u64_t)0x0 << 32) | 0x35000;
    x86_wrmsr(X86_MSR_PMC_MS_CCCR(1), val);

    // Store x87_FP_uop
    val = ((u64_t)0x0 << 32) | 0x33000;
    x86_wrmsr(X86_MSR_PMC_FLAME_CCCR(0), val);

    // Store mispredicted branches
    val = ((u64_t)0x0 << 32) | 0x39000;
    x86_wrmsr(X86_MSR_PMC_IQ_CCCR(0), val);

    // Store memory retired
    val = ((u64_t)0x0 << 32) | 0x3B000;
    x86_wrmsr(X86_MSR_PMC_IQ_CCCR(1), val);

    // Store load miss level 1 data cache
    val = ((u64_t)0x0 << 32) | 0x3B000;
    x86_wrmsr(X86_MSR_PMC_IQ_CCCR(2), val);

    // Store uop type
    val = ((u64_t)0x0 << 32) | 0x35000;
    x86_wrmsr(X86_MSR_PMC_IQ_CCCR(4), val);

    // Setup complete

#else
     /* disable PMCs via CCCR (reserved bits 16/17 set, enable bit clear) */
     x86_wrmsr(X86_MSR_PMC_IQ_CCCR(0), 3 << 16);
     x86_wrmsr(X86_MSR_PMC_IQ_CCCR(2), 3 << 16);
     
     /* clear PMCs */
     x86_wrmsr(X86_MSR_PMC_IQ_COUNTER(0), 0);
     x86_wrmsr(X86_MSR_PMC_IQ_COUNTER(2), 0);

     /*
      * init ESCR0:
      * user mode (bit 2),
      * event mask = non-bogus tagged and non-tagged (bits 9/10),
      * event select = instr_retired (0x02, bits 25-30)
      */
     x86_wrmsr(X86_MSR_PMC_CRU_ESCR0, (1 << 2) | (3 << 9) | (2 << 25));

     /*
      * init ESCR1:
      * kernel mode (bit 3),
      * event mask = non-bogus tagged and non-tagged (bits 9/10),
      * event select = instr_retired (0x02, bits 25-30)
      */
     x86_wrmsr(X86_MSR_PMC_CRU_ESCR1, (1 << 3) | (3 << 9) | (2 << 25));

     /*
      * enable PMCs via CCCR:
      * enable (bit 12) + ESCR select (bits 13-15) + reserved bits 16/17
      */
     x86_wrmsr(X86_MSR_PMC_IQ_CCCR(0), (1 << 12) | (X86_MSR_PMC_IQ_CTR_CRU_ESCR01 << 13) | (3 << 16));
     x86_wrmsr(X86_MSR_PMC_IQ_CCCR(2), (1 << 12) | (X86_MSR_PMC_IQ_CTR_CRU_ESCR01 << 13) | (3 << 16));

#endif /* CONFIG_TBUF_PERFMON_ENERGY || CONFIG_X_EVT_LOGGING */
#endif /* CONFIG_CPU_X86_P4 */
 
}
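
/* Readback sketch for the simple configurations programmed above
 * (assumption: x86_rdmsr() is the read counterpart of the x86_wrmsr()
 * provided by INC_ARCH(cpu.h)). */
#if defined(CONFIG_CPU_X86_I686) || defined(CONFIG_CPU_X86_K8)
INLINE u64_t perfmon_user_instrs(void)   { return x86_rdmsr(X86_MSR_PMC_CTR0); }
INLINE u64_t perfmon_kernel_instrs(void) { return x86_rdmsr(X86_MSR_PMC_CTR1); }
#endif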
#endif /* !__PLATFORM__PC99__PERFMON_H__ */