#include <stdio.h>
#include <string.h>
#include "cache.h"
#include "logging.h"
#include "rpi-base.h"
#include "startup.h"
#include "defs.h"

// Historical Note:
// We were seeing core 3 crashes if inner *and* outer were both set to some flavour of WB (i.e. 1 or 3)
// The crash happened at the point where the data cache was enabled
// At that point, the stack appeared to vanish and the data read back was 0x55555555
// The reason turned out to be a failure to correctly invalidate the entire data cache

// Index (address >> 20, i.e. counted in 1MB sections) of the first section
// that is no longer given the inner cache policy. Sections below this index
// are mapped with both the inner (aa) and outer (bb) policies.
static const unsigned l1_cached_threshold = L2_CACHED_MEM_BASE >> 20;
// Index (in 1MB sections) of the first uncached section. Sections between
// the two thresholds are mapped with the outer (bb) policy only.
static const unsigned l2_cached_threshold = UNCACHED_MEM_BASE >> 20;

// First-level translation table: 4096 entries, one per 1MB section of the
// 32-bit address space. Aligned to 0x4000 bytes.
volatile __attribute__ ((aligned (0x4000))) unsigned int PageTable[4096];
// Second-level table entries, one per 4KB page, for the first NUM_4K_PAGES
// pages of the address space (filled in by map_4k_page).
volatile __attribute__ ((aligned (0x4000))) unsigned int PageTable2[NUM_4K_PAGES];

// Cache policy fields merged into the descriptors.
// Encoding: 00 = NC, 01 = WBWA, 10 = WT, 11 = WBNWA
static const int aa = 1;         // inner cache policy
static const int bb = 1;         // outer cache policy
static const int shareable = 1;  // shareable attribute (bit 16 of the section descriptors)

// Bit position of the cache level field in a set/way operand
// (level field value 0 selects L1, 1 selects L2)
enum {
   SETWAY_LEVEL_SHIFT = 1
};

// L1 data cache: 4 ways x 128 sets x 64 bytes per line = 32KB
enum {
   L1_DATA_CACHE_SETS  = 128,
   L1_DATA_CACHE_WAYS  = 4,
   L1_SETWAY_WAY_SHIFT = 30,   // 32 - log2(L1_DATA_CACHE_WAYS)
   L1_SETWAY_SET_SHIFT = 6     // log2(L1 data cache line length in bytes)
};

// Pi 2 L2 unified cache: 8 ways x 1024 sets x 64 bytes per line = 512KB
enum {
   PI2_L2_CACHE_SETS       = 1024,
   PI2_L2_CACHE_WAYS       = 8,
   PI2_L2_SETWAY_WAY_SHIFT = 29   // 32 - log2(PI2_L2_CACHE_WAYS)
};

// Pi 3 L2 unified cache: 16 ways x 512 sets x 64 bytes per line = 512KB
enum {
   PI3_L2_CACHE_SETS       = 512,
   PI3_L2_CACHE_WAYS       = 16,
   PI3_L2_SETWAY_WAY_SHIFT = 28   // 32 - log2(PI3_L2_CACHE_WAYS)
};

// Common to both L2 variants
enum {
   L2_SETWAY_SET_SHIFT = 6   // log2(L2 cache line length in bytes)
};

// The origin of this function is:
// https://github.com/rsta2/uspi/blob/master/env/lib/synchronize.c

// Invalidate every line of the L1 data cache and then of the L2 unified
// cache, one set/way at a time (DCISW).
//
// The L2 geometry is picked from the hardware id: the Pi 2 values are used
// when _get_hardware_id() returns _RPI2, the Pi 3 values in every other case.
void InvalidateDataCache (void)
{
   unsigned set;
   unsigned way;
   unsigned l2_sets;
   unsigned l2_ways;
   unsigned l2_way_shift;
   uint32_t operand;

   // Pass 1: L1 data cache (level field = 0)
   for (set = 0; set < L1_DATA_CACHE_SETS; set++) {
      for (way = 0; way < L1_DATA_CACHE_WAYS; way++) {
         operand = (way << L1_SETWAY_WAY_SHIFT)
                 | (set << L1_SETWAY_SET_SHIFT)
                 | (0 << SETWAY_LEVEL_SHIFT);
         asm volatile ("mcr p15, 0, %0, c7, c6,  2" : : "r" (operand) : "memory");   // DCISW
      }
   }

   // Choose the L2 geometry; the set shift is common to both variants
   if (_get_hardware_id() == _RPI2) { // Raspberry Pi 2
      l2_sets      = PI2_L2_CACHE_SETS;
      l2_ways      = PI2_L2_CACHE_WAYS;
      l2_way_shift = PI2_L2_SETWAY_WAY_SHIFT;
   } else {
      l2_sets      = PI3_L2_CACHE_SETS;
      l2_ways      = PI3_L2_CACHE_WAYS;
      l2_way_shift = PI3_L2_SETWAY_WAY_SHIFT;
   }

   // Pass 2: L2 unified cache (level field = 1)
   for (set = 0; set < l2_sets; set++) {
      for (way = 0; way < l2_ways; way++) {
         operand = (way << l2_way_shift)
                 | (set << L2_SETWAY_SET_SHIFT)
                 | (1 << SETWAY_LEVEL_SHIFT);
         asm volatile ("mcr p15, 0, %0, c7, c6,  2" : : "r" (operand) : "memory");   // DCISW
      }
   }
}

// Clean every line of the L1 data cache and then of the L2 unified cache,
// one set/way at a time (DCCSW).
//
// The L2 geometry is picked from the hardware id: the Pi 2 values are used
// when _get_hardware_id() returns _RPI2, the Pi 3 values in every other case.
void CleanDataCache (void)
{
   unsigned set_idx;
   unsigned way_idx;
   unsigned l2_set_count;
   unsigned l2_way_count;
   unsigned l2_way_shift;
   int is_pi2;
   uint32_t setway;

   // Level field 0: clean the L1 data cache
   for (set_idx = 0; set_idx < L1_DATA_CACHE_SETS; set_idx++) {
      for (way_idx = 0; way_idx < L1_DATA_CACHE_WAYS; way_idx++) {
         setway = (way_idx << L1_SETWAY_WAY_SHIFT)
                | (set_idx << L1_SETWAY_SET_SHIFT)
                | (0 << SETWAY_LEVEL_SHIFT);
         asm volatile ("mcr p15, 0, %0, c7, c10,  2" : : "r" (setway) : "memory");   // DCCSW
      }
   }

   // The hardware id is queried only after the L1 pass has completed
   is_pi2 = (_get_hardware_id() == _RPI2);
   l2_set_count = is_pi2 ? PI2_L2_CACHE_SETS       : PI3_L2_CACHE_SETS;
   l2_way_count = is_pi2 ? PI2_L2_CACHE_WAYS       : PI3_L2_CACHE_WAYS;
   l2_way_shift = is_pi2 ? PI2_L2_SETWAY_WAY_SHIFT : PI3_L2_SETWAY_WAY_SHIFT;

   // Level field 1: clean the L2 unified cache
   for (set_idx = 0; set_idx < l2_set_count; set_idx++) {
      for (way_idx = 0; way_idx < l2_way_count; way_idx++) {
         setway = (way_idx << l2_way_shift)
                | (set_idx << L2_SETWAY_SET_SHIFT)
                | (1 << SETWAY_LEVEL_SHIFT);
         asm volatile ("mcr p15, 0, %0, c7, c10,  2" : : "r" (setway) : "memory");   // DCCSW
      }
   }
}


// Second-level (4KB small page) descriptor format
// 31..12 Page Base Address
// 11..9        - unused, set to zero
// 8..6   TEX   - type extension- TEX, C, B used together, see below
// 5..4   AP    - access ctrl   - set to 11 for full access from user and super modes
// 3      C     - cacheable     - TEX, C, B used together, see below
// 2      B     - bufferable    - TEX, C, B used together, see below
// 1      1
// 0      1 on Pi 0/1; 0 on Pi 2/3, where this bit is XN (execute never)

// Point the second-level page table entry for one 4KB page at a physical page,
// using the inner (aa) and outer (bb) cache policies.
//
//   logical  - number of the virtual 4KB page (virtual address >> 12); also
//              used directly as the index into PageTable2
//   physical - number of the physical 4KB page (physical address >> 12)
//
// The page numbers are converted to unsigned before being shifted: shifting a
// signed int left into or past the sign bit is undefined behaviour in C, and
// page numbers >= 0x80000 (addresses of 2GB and above) would do exactly that.
void map_4k_page(int logical, int physical) {
   const unsigned virt_addr = (unsigned) logical << 12;
   const unsigned phys_addr = (unsigned) physical << 12;

   // Invalidate the data TLB before changing mapping
   _invalidate_dtlb_mva((void *) virt_addr);
   // Setup the 4K page table entry
   // Second level descriptors use extended small page format so inner/outer caching can be controlled
   // Pi 0/1:
   //   XP (bit 23) in SCTRL is 0 so descriptors use ARMv4/5 backwards compatible format
   // Pi 2/3:
   //   XP (bit 23) in SCTRL no longer exists, and we seem to be using ARMv6 table formats
   //   this means bit 0 of the page table is actually XN and must be clear to allow native ARM code to execute
   //   (this was the cause of issue #27)
   if (_get_hardware_id() >= _RPI2) {
       PageTable2[logical] = phys_addr | 0x132 | (bb << 6) | (aa << 2);
   } else {
       PageTable2[logical] = phys_addr | 0x133 | (bb << 6) | (aa << 2);
   }
}

// Build the translation tables, then enable the MMU, the L1 data and
// instruction caches and branch prediction.
//
// Sequence (the order of the steps matters):
//   1. Fill PageTable with 1MB section descriptors for the whole address space
//   2. (USE_CACHED_SCREEN builds only) remap the screen area as cached
//   3. Copy 0x1000 bytes from address 0 to HIGH_VECTORS_BASE
//   4. Replace the first NUM_4K_PAGES >> 8 sections with second-level tables
//      and identity-map every 4KB page in them
//   5. Point the vector base register (c12, c0, 0) at HIGH_VECTORS_BASE
//   6. Set bit 6 of the auxiliary control register (models below _RPI3 only)
//   7. Set up domain access control, TTBCR and TTBR0
//   8. Invalidate the data cache
//   9. Set the enable bits in the system control register
//
// Parameters (both are only used when built with USE_CACHED_SCREEN):
//   cached_screen_area - byte address of the screen RAM to map as cached;
//                        0 means no screen area is remapped
//   cached_screen_size - size in bytes of that area; sections from
//                        area >> 20 up to, but not including,
//                        (area + size) >> 20 are remapped
void enable_MMU_and_IDCaches(int cached_screen_area, int cached_screen_size)
{

   log_debug("enable_MMU_and_IDCaches");
   //log_debug("cpsr    = %08x", _get_cpsr());

   unsigned i;
   unsigned base;

   // 1MB Section descriptor format (first-level page table entry)
   // 31..20 Section Base Address
   // 19     NS    - ?             - set to 0
   // 18     0     -               - set to 0
   // 17     nG    - ?             - set to 0
   // 16     S     - ?             - set to 0

   // 15     APX   - access ctrl   - set to 0 for full access from user and super modes
   // 14..12 TEX   - type extension- TEX, C, B used together, see below

   // 11..10 AP    - access ctrl   - set to 11 for full access from user and super modes
   // 9      P     -               - set to 0
   // 8..5   Domain- access domain - set to 0000 as not using access ctrl
   // 4      XN    - eXecute Never - set to 1 for I/O devices

   // 3      C     - cacheable     - set to 1 for cacheable RAM
   // 2      B     - bufferable    - set to 1 for cacheable RAM
   // 1      1                     - TEX, C, B used together, see below
   // 0      0                     - TEX, C, B used together, see below

   // For I/O devices
   // TEX = 000; C=0; B=1 (Shared device)

   // For cacheable RAM
   // TEX = 001; C=1; B=1 (Outer and inner write back, write allocate)

   // For non-cacheable RAM
   // TEX = 001; C=0; B=0 (Outer and inner non-cacheable)

   // For individual control
   // TEX = 1BB CB=AA
   // AA = inner policy
   // BB = outer policy
   // 00 = NC    (non-cacheable)
   // 01 = WBWA  (write-back, write allocate)
   // 10 = WT    (write-through)
   // 11 = WBNWA (write-back, no write allocate)
   // e.g. TEX = 100; C=0; B=1 (outer non cacheable, inner write-back, write allocate)

   // Region 1: sections below l1_cached_threshold get both the inner (aa) and outer (bb) policies
   for (base = 0; base < l1_cached_threshold; base++) // 0x04000000 64MB
   {
      // Value from my original RPI code = 11C0E (outer and inner write back, write allocate, shareable)
      // bits 11..10 are the AP bits, and setting them to 11 enables user mode access as well
      // Values from RPI2 = 11C0E (outer and inner write back, write allocate, shareable (fast but unsafe)); works on RPI
      // Values from RPI2 = 10C0A (outer and inner write through, no write allocate, shareable)
      // Values from RPI2 = 15C0A (outer write back, write allocate, inner write through, no write allocate, shareable)
      PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12) | (aa << 2);
   }
   // Region 2: sections up to l2_cached_threshold get the outer (bb) policy only (inner field left at 00 = NC)
   for (; base < l2_cached_threshold; base++) // 0x08000000 128MB
   {
      PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12);
   }
   // Region 3: sections up to the peripheral base are non-cacheable RAM (TEX = 001; C=0; B=0)
   for (; base < (_get_peripheral_base() >> 20); base++)
   {
      PageTable[base] = base << 20 | 0x01C02;
   }
   // Region 4: everything from the peripheral base to the top of the address space
   for (; base < 4096; base++)
   {
      // shared device, never execute
      PageTable[base] = base << 20 | 0x10C16;
   }

#if defined(USE_CACHED_SCREEN)
   // Override the sections covering the screen area with the fully cached attributes of region 1
   if (cached_screen_area != 0) {
       for (base = (cached_screen_area >> 20); base < ((cached_screen_area + cached_screen_size) >> 20); base++)
       {
          PageTable[base] = base << 20 | 0x04C02 | (shareable << 16) | (bb << 12) | (aa << 2);  //cached part of screen ram
       }
   }
#endif

   // suppress a warning as we really do want to copy from src address 0!
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wnonnull"
   // copy vectors from virtual address zero to a higher unused location
   // cppcheck-suppress nullPointer
   memcpy((void *)HIGH_VECTORS_BASE, (void *)0, 0x1000);
#pragma GCC diagnostic pop

   // replace the first N 1MB entries with second level page tables, giving N x 256 4K pages
   for (i = 0; i < NUM_4K_PAGES >> 8; i++)
   {
      // each first-level entry holds the address of its 256-entry slice of PageTable2 ...
      PageTable[i] = (unsigned int) (&PageTable2[i << 8]);
      // ... with bit 0 set, so bits 1..0 = 01 rather than the 10 used by the section entries above
      PageTable[i] +=1;
   }

   // populate the second level page tables
   // (identity mapping: logical page number == physical page number)
   for (base = 0; base < NUM_4K_PAGES; base++)
   {
      map_4k_page(base, base);
   }

   // relocate the vector pointer to the moved page
   asm volatile("mcr p15, 0, %[addr], c12, c0, 0" : : [addr] "r" (HIGH_VECTORS_BASE));

   if (_get_hardware_id() >= _RPI3) {

       // The register pair is only read, never modified; the values feed the
       // commented-out debug log below
       unsigned cpuextctrl0, cpuextctrl1;
       asm volatile ("mrrc p15, 1, %0, %1, c15" : "=r" (cpuextctrl0), "=r" (cpuextctrl1));
       //log_debug("extctrl = %08x %08x", cpuextctrl1, cpuextctrl0);

   } else {

       // RPI:  bit 6 of auxctrl is restrict cache size to 16K (no page coloring)
       // RPI2: bit 6 of auxctrl is set SMP bit, otherwise all caching disabled
       unsigned auxctrl;
       asm volatile ("mrc p15, 0, %0, c1, c0,  1" : "=r" (auxctrl));
       auxctrl |= 1 << 6;
       asm volatile ("mcr p15, 0, %0, c1, c0,  1" :: "r" (auxctrl));
       // read back, for the commented-out debug log below
       asm volatile ("mrc p15, 0, %0, c1, c0,  1" : "=r" (auxctrl));
       //log_debug("auxctrl = %08x", auxctrl);
   }

   // set domain 0 to client
   asm volatile ("mcr p15, 0, %0, c3, c0, 0" :: "r" (1));

   // always use TTBR0
   asm volatile ("mcr p15, 0, %0, c2, c0, 2" :: "r" (0));

   // read back, for the commented-out debug log below
   unsigned ttbcr;
   asm volatile ("mrc p15, 0, %0, c2, c0, 2" : "=r" (ttbcr));
   //log_debug("ttbcr   = %08x", ttbcr);

   if (_get_hardware_id() >= _RPI2) {
       // set TTBR0 - page table walk memory cacheability/shareable
       // [Bit 0, Bit 6] indicates inner cacheability: 01 = normal memory, inner write-back write-allocate cacheable
       // [Bit 4, Bit 3] indicates outer cacheability: 01 = normal memory, outer write-back write-allocate cacheable
       // Bit 1 indicates shareable
       // 4A = 0100 1010
       // (aa is split across two bits: its low bit goes to bit 6, its high bit to bit 0)
       int attr = ((aa & 1) << 6) | (bb << 3) | (shareable << 1) | ((aa & 2) >> 1);
       asm volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r" (attr | (unsigned) &PageTable));
   } else {
       // set TTBR0 (page table walk inner cacheable, outer non-cacheable, shareable memory)
       asm volatile ("mcr p15, 0, %0, c2, c0, 0" :: "r" (0x03 | (unsigned) &PageTable));
   }
   // read back, for the commented-out debug log below
   unsigned ttbr0;
   asm volatile ("mrc p15, 0, %0, c2, c0, 0" : "=r" (ttbr0));
   //log_debug("ttbr0   = %08x", ttbr0);


   // Invalidate entire data cache
   if (_get_hardware_id() >= _RPI2) {
       // the ISB is emitted as a raw opcode because the mnemonic is rejected when assembling for ARMv6
       asm volatile (".word 0xf57ff06f" ::: "memory");        // asm volatile ("isb" ::: "memory"); (won't compile on arm v6)
       InvalidateDataCache();
   } else {
       // flush prefetch buffer, then invalidate data cache
       // NOTE: The below code seems to cause a Pi 2 to crash
       asm volatile ("mcr p15, 0, %0, c7, c5,  4" :: "r" (0) : "memory");
       asm volatile ("mcr p15, 0, %0, c7, c6,  0" :: "r" (0) : "memory");
   }

   // enable MMU, L1 data cache, L1 instruction cache and branch prediction
   unsigned sctrl;
   asm volatile ("mrc p15,0,%0,c1,c0,0" : "=r" (sctrl));
   // Bit 13 enables vector relocation
   // Bit 12 enables the L1 instruction cache
   // Bit 11 enables branch pre-fetching
   // Bit  2 enables the L1 data cache
   // Bit  0 enables the MMU
   // The L1 instruction cache can be used independently of the MMU
   // The L1 data cache will only be enabled if the MMU is enabled
   //
   // The mask below sets bits 12, 11, 2 and 0 only; bit 13 is left as read.

   sctrl |= 0x00001805;
   asm volatile ("mcr p15,0,%0,c1,c0,0" :: "r" (sctrl) : "memory");
   // read back, for the commented-out debug log below
   asm volatile ("mrc p15,0,%0,c1,c0,0" : "=r" (sctrl));
   //log_debug("sctrl   = %08x", sctrl);

   // For information, show the cache type register
   // From this you can tell what type of cache is implemented
   unsigned ctype;
   asm volatile ("mrc p15,0,%0,c0,c0,1" : "=r" (ctype));
   //log_debug("ctype   = %08x", ctype);
}
