#ifndef SAMPLE_PREFETCHER_H
#define SAMPLE_PREFETCHER_H

#include <stdio.h>

#include <map>
#include <fstream>
#include <iomanip>
#include <iostream>
#include  <list>
#include  <vector>
#include  <deque>
#include  <set>
#include  <map>

using namespace std;

///////////////////////////////////////////////////////////////////////////////
// Basic Classes and Templates
///////////////////////////////////////////////////////////////////////////////

/// Nodes in the LRU-like queues
template<class ItemType>
class _List_Node
{
  public:
    ItemType _item;
    _List_Node* _next;
    _List_Node* _prev;
    
    _List_Node(ItemType item = ItemType())
        :_item(item), 
         _next(0), 
         _prev(0)
    {}

    _List_Node(_List_Node& node)
        :_item(node._item), 
         _next(0), 
         _prev(0)
    {}

    ~_List_Node(){}
};

/// template class for any LRU-like queues.
/// The queue is implemented in a form of double-linked list.
/// The template provides constructor/destructor, and a "bring to head" method.
/// Inherite from LRUList<ItemType> to create your new queue class,
/// where ItemType is the class of queue items. 
/// Requirement of ItemType:
/// 	o. It should provide default/copy constructors.
///		o. It should provide a Dump() routine for dumpping out its content to stdout. This is useful in debugging.
template<class ItemType>
class LRUList{
  public:
    typedef _List_Node<ItemType> Node;
  protected:
    Node* mListHead;
    Node* mListTail;
    UINT32 mListLength;

    /// Brings the node to the head of the queue.
    /// This makes it Most Recently Used
    void BringsToHead(Node* ptr){

        if( ptr->_prev != NULL ) 
        {
            // Pull the entry out chain previous and next elements
            // to each other
            ptr->_prev->_next = ptr->_next;
            if( ptr->_next ) 
            {
                ptr->_next->_prev = ptr->_prev;
            }
            else 
            {
                // If tail is being removed, set the tail to the
                // previous guy in the link-list
                mListTail = ptr->_prev;
            }

            // Since we are bringing this to the head, next
            // element is going to be the current head.  
            ptr->_next        = mListHead;
            ptr->_prev        = NULL;
                
            // set current head's previous to this
            mListHead->_prev   = ptr;
                
            // This is now the current head
            mListHead          = ptr;
        }
    }

    /// Brings the node to the tail of the queue.
    /// This makes it Least Recently Used
    void BringsToTail(Node* ptr){
        if( ptr->_next != NULL ) 
        {
            // Pull the entry out chain previous and next elements
            // to each other
            ptr->_next->_prev = ptr->_prev;
            if( ptr->_prev ) 
            {
                ptr->_prev->_next = ptr->_next;
            }
            else 
            {
                // If head is being removed, set the head to the
                // next guy in the link-list
                mListHead        = ptr->_next;
            }

            // Since we are bringing this to the tail, next
            // element is going to be the current tail.  
            ptr->_prev        = mListTail;
            ptr->_next        = NULL;

            mListTail->_next   = ptr;

            // This is now the current tail
            mListTail          = ptr;
        }
    }

  public:
    /// Create the list with specified length.
    LRUList(UINT32 listLength): 
        mListHead(0), 
        mListTail(0), 
        mListLength(listLength) 
    {
        Node* prevNode = 0;
        for (UINT32 i=0; i<listLength; i++){
            Node* node = new Node();
            if (mListHead == 0) mListHead = node;
            node->_prev = prevNode;
            if (prevNode) prevNode->_next = node;
            prevNode = node;
        }
        mListTail = prevNode;
    }

    ~LRUList()
    {
        while (mListHead)
        {
            Node* node = mListHead->_next;
            delete mListHead;
            mListHead = node;
        }
    }

    /// Utillity routine for debugging in gdb. Dump out the list.
    void Dump()
    {
        Node* node=mListHead;
        for (UINT32 i=0; i<mListLength; i++, node=node->_next)
        {
            ;//cout<<i<<": ";
            if (node) node->_item.Dump();
            ;//cout<<endl;
        }
    }
};

///////////////////////////////////////////////////////////////////////////////
// Stride Prefetching
///////////////////////////////////////////////////////////////////////////////

class MemLogEntry{
  public:
    ADDRINT ip; // IP of the mem instruction
    ADDRINT last_mem_addr; // the last mem addr
    INT32   stride; // stride detected
    UINT32  count; // the mem addr is accessed in stride for "count" times
    bool    trained;
    
    /// default constructor
    MemLogEntry(ADDRINT ip = 0)
        :last_mem_addr(0),
         stride(0),
         count(0),
         trained(0)
    { 
        this->ip = ip; 
    }

    /// copy constructor
    MemLogEntry(const MemLogEntry& ent)
        :ip(ent.ip), 
         last_mem_addr(ent.last_mem_addr), 
         stride(ent.stride), 
         count(ent.count),
         trained(ent.trained)
    {
        // Nothing
    }

    ~MemLogEntry()
    {
        // Nothing
    }

    /// Dump routine
    void Dump(){
        ;//cout << hex << ip << "," << last_mem_addr << "," << dec <<stride << "," << count;
    }
};

class StridePrefetchTable
    : public LRUList<MemLogEntry> 
{
  private:
    map<ADDRINT, Node*> PClist;
    map<ADDRINT, Node*>::iterator iter;

  public:
    static const UINT32 DEFAULT_STRIDE_PREFETCH_TABLE_SIZE = 256;

    StridePrefetchTable(UINT32 tableSize=DEFAULT_STRIDE_PREFETCH_TABLE_SIZE)
        : LRUList<MemLogEntry>(tableSize){}

    ~StridePrefetchTable(){}

    /// Look-up in the prefetch table. This leeds to a subsequent LRU update in the table.
    /// Returns the entry if hit.
    MemLogEntry* AccessEntry(UINT32 threadId, ADDRINT ip, ADDRINT memAddr){

        // find the ip log in the table
        Node* node = mListHead;
        while (node && node->_item.ip!=ip) node=node->_next;

        MemLogEntry* entry;
        if ( node == NULL ) { 
            /// Replace tail
            node = mListTail;

            entry = &(node->_item);

            DelFrmPClist( entry->ip );
            entry->ip = ip;
            entry->last_mem_addr = memAddr;
            entry->stride = 0;
            entry->trained = false;
            AddToPClist( ip, node );
        }
        else         /// update
        {
            entry = &(node->_item);

//         ;//cout <<"Found IP: "<<hex<<ip
//              <<" Last Mem Addr: "<<entry->last_mem_addr
//              <<" Curr Mem Addr: "<<memAddr<<dec<<endl;

            INT32 newstride      = memAddr - entry->last_mem_addr;
            entry->last_mem_addr = memAddr;

            if (newstride && (newstride == entry->stride)) {
                ++(entry->count); 
                if( entry->count >= 1 ) 
                {
                  if(!(entry->trained))
                    entry->trained = true;
                }
            }
            else { 
                entry->count = 0; 
                entry->stride = newstride;
                entry->trained = false;
            }
        }

        BringsToHead(node);

        return entry;
    }


    Node* FindPC( ADDRINT ip )
    {
        iter = PClist.find(ip);
        if( iter != PClist.end() ) 
        {
            return iter->second;
        }

        return NULL;
    }

    inline void AddToPClist( ADDRINT ip, Node *tmp ) 
    {
        PClist[ ip ] = tmp;
    }

    inline void DelFrmPClist( ADDRINT ip ) 
    {
            PClist.erase( ip );
    }

};

class SamplePrefetcher : public StridePrefetchTable
{
  private:

    
  public:
    
    SamplePrefetcher() : StridePrefetchTable()
    {
    }
    
    ~SamplePrefetcher()
    {
    }
};



// ahmad's code begins here....
#ifndef  __GLOBALS_H__
#define  __GLOBALS_H__

#include  <map>
#include  <assert.h>
///#include  <boost/dynamic_bitset.hpp>
#include  <map>
#include  <deque>
#include  <queue>
#include  "interface.h"

// cache defines.
#define LINE_BITS (6)
#define LINE_SIZE (1<<LINE_BITS)
#define LINE_NUMBER(x) (x>>LINE_BITS)

#define REGION_SIZE (1<<REGION_BITS)
#define REGION_NUMBER(x) (x>>REGION_BITS)
#define REGION(x) (x&~((1<<REGION_BITS)-1))
#define REGION_OFFSET(x) (x&((1<<REGION_BITS)-1))

#define PDEBUG(x) ;//cout<<#x" is: "<<x<<endl;

///#define PRINT_DEC(x) ;//cout<<dec<<#x<<"addr: "<<(x).DataAddr<<" cycle: "<<(x).LastRequestCycle<<" pc: "<<(x).LastRequestAddr<<" seq: "<<(x).SequenceNumber<<" ld/st: "<<(x).AccessType<<endl;
///#define PRINT_DEC_ONLY(x) ;//cout<<dec<<(x).DataAddr<<" "<<(x).LastRequestCycle<<" "<<(x).LastRequestAddr<<" "<<(x).SequenceNumber<<" "<<(x).AccessType<<endl;
///#define PRINT_HEX(x) ;//cout<<hex<<#x<<"addr: "<<(x).DataAddr<<" cycle: "<<(x).LastRequestCycle<<" pc: "<<(x).LastRequestAddr<<" seq: "<<(x).SequenceNumber<<" ld/st: "<<(x).AccessType<<endl;
///#define PRINT_HEX_ONLY(x) ;//cout<<hex<<(x).DataAddr<<" "<<(x).LastRequestCycle<<" "<<(x).LastRequestAddr<<" "<<(x).SequenceNumber<<" "<<(x).AccessType<<endl;
///#define PRINT_HEX_ONLY(x) ;//cout<<hex<<(x).LastRequestAddr<<" "<<(x).DataAddr<<" "<<(x).SequenceNumber<<" "<<(x).LastRequestCycle<<" "<<(x).AccessType<<endl;
#define PRINT_DEC_ONLY(x) ;//cout<<dec<<setw(12)<<(x).LastRequestAddr<<setw(12)<<((x).DataAddr>>LINE_BITS)<<setw(8)<<(x).SequenceNumber<<setw(8)<<(x).LastRequestCycle<<setw(2)<<(x).AccessType<<endl;
#define PRINT_DEC_ONLY_NOENDL(x) ;//cout<<dec<<setw(12)<<(x).LastRequestAddr<<setw(12)<<((x).DataAddr>>LINE_BITS)<<setw(8)<<(x).SequenceNumber<<setw(8)<<(x).LastRequestCycle<<setw(2)<<(x).AccessType;

#define L1_ACCESS (1<<4)
#define IS_L1(x) (x&L1_ACCESS)

typedef std::map<CacheAddr_t, unsigned int> PCMap;
typedef std::vector<std::pair<CacheAddr_t, unsigned int> > PCVector;

typedef std::deque<CacheAddr_t> AddressList;
typedef std::vector< AddressList > Regions;
typedef std::map<CacheAddr_t, AddressList > LHB;

PCMap::iterator GetHighest(PCMap & l2MissPCs);
void PrintHighestPCs(PCMap & l2MissPCs);

typedef std::deque<PrefetchData_t> History;
typedef std::map<CacheAddr_t, History> PCHistory;
typedef std::deque<PrefetchData_t*> HistoryPtr;

#define HISTORY_SIZE 64
#define GLOBAL_HISTORY_SIZE 64
extern bool AddToHistory(History & h, PrefetchData_t & d, int type);
extern bool AddToPCHistory(PCHistory & h, PrefetchData_t & d, int type);
extern void PrintPCHistory(History & h, int mask);
extern void PrintGlobalHistory(History & h);
extern void PrintHistory(HistoryPtr & h);

template <class T, class U>
extern void PRINT_DELTAS(const T& coll, const U& a);
template <class T>
extern void PRINT_ABS(const T& coll);

template <class T, class U>
extern void GET_NN(T & output, U & input, CacheAddr_t current, CacheAddr_t threshold);

template <class T, class U>
extern void FindStride(T & region, U i, std::map<int, int > & strideMap);

template <class T, class V>
extern void GetN(V & output, T & region, typename T::iterator i, int n);

template <class T, class U>
extern void PRINT_DELTAS_P(const T & coll, const U & now);

template <class T, class U>
extern void GET_DELTAS(T&coll, U&output);

#endif  /*__GLOBALS_H__*/

#ifndef  __BASE_PREFETCHER_H__
#define  __BASE_PREFETCHER_H__

///#include  "globals.h"

struct PrefetchInfo
{
  COUNTER cycle;
  unsigned int hits;
  CacheAddr_t delta;
  PrefetchInfo(COUNTER c, CacheAddr_t d)
  {
    hits=0;
    cycle=c;
    delta=d;
  }
  PrefetchInfo()
  {
    assert(0);
  }
};


#if 1
// This is the ideal prefetch filter... it tracks N prefetches issued so far and 
// removes duplicates.

// Algorithm for checking if previously set....
// Upon prefetch, look in the queue. If hit, make it MRU.
// Go through the list again, see if any is in the cache. If so make it LRU.
struct PrefetchStats
{
  // This map is used only for SIMULATION PERFORMANCE reasons...
  // It is not included in storage budget... I could do a lookup using l alone, but map
  // is a red-black tree that gives me O(logn) lookup time instead of O(n).
  // again, sim perf is good with map. :)
  std::map<CacheAddr_t, PrefetchInfo> m;
  AddressList l;

  unsigned int max;

  COUNTER cycles;
  COUNTER total;
  COUNTER dupe;
  COUNTER hits;
  COUNTER wasted;
  PrefetchStats(int s)
  {
    cycles=0;
    total=0;
    dupe=0;
    wasted=0;
    max=s;
  }
  void Demand(COUNTER cycle, CacheAddr_t a)
  {
    if(m.find(a)!=m.end())
    {
      assert(cycle>=m[a].cycle);
      if(m[a].hits==0)
      {
        ;//signed long long int d=m[a].delta;
        ;//cout<<"PHIT: "<<(a>>LINE_BITS)<<" "<<m[a].cycle<<" "<<cycle-m[a].cycle<<" "<< (d>>LINE_BITS) <<endl;
        ;//cout<<"total: "<<total<<" hits: "<<hits<<" dupe: "<<dupe<<" cycles: "<<cycles<<endl;
        cycles+=(cycle-m[a].cycle);
        hits++;
      }
      m[a].hits++;
    }
  }
  bool Prefetch(COUNTER cycle, CacheAddr_t a, PrefetchData_t * trigger)
  {
    total++;
    if(m.find(a)==m.end())
    {
      // See if we need to shift the deque
      if(l.size()==max)
      {
        std::map<CacheAddr_t, PrefetchInfo>::iterator i;
        i=m.find(l.front());
        assert(i!=m.end());
        if(i->second.hits==0)
        {
          ;//cout<<"WASTED PREFETCH: "<< LINE_NUMBER(i->first)<<endl;
          wasted++;
        }
        m.erase(i);
        l.pop_front();
      }
      l.push_back(a);
      CacheAddr_t b=(a-trigger->DataAddr);
      ;//cout<<"PREFETCHING: "<<(((signed long long int)b)>>LINE_BITS) ;
      m.insert(std::make_pair<CacheAddr_t, PrefetchInfo>(a, PrefetchInfo(cycle, b)));
      assert(m[a].cycle==cycle);
      return true;
    }
    else
    {
      dupe++;
      return false;
    }
  }
  void PrintStats()
  {
    COUNTER average;
    ;//cout<<"Printing prefetcher stats: "<<endl;
    average=(hits==0)?0:cycles/hits;
    ;//cout<<"total: "<<total<<" unique: "<<total-dupe<<" hits: "<<hits<<" wasted: "<<wasted<<" dupe: "<<dupe<<" cycles: "<<cycles<<" avg_cycles: "<<average<<endl;
  }
};
#else
#endif

class BasePrefetcher
{
  public:
  bool printL1;
  bool printL2;
  PrefetchStats l1Stats, l2Stats;

  COUNTER l1Hits;
  COUNTER l1Misses;
  COUNTER l2Hits;
  COUNTER l2Misses;

  COUNTER l2PrefetchesBlocked;
  COUNTER l1PrefetchesBlocked;

  BasePrefetcher() : l1Stats(32), l2Stats(32)
  {
    l1Hits=0; l1Misses=0;
    l2Hits=0; l2Misses=0;
    printL1=true;
    printL2=true;
    l1PrefetchesBlocked=0;l2PrefetchesBlocked=0;
  }

  void IssuePrefetches( COUNTER cycle, PrefetchData_t *L1Data, PrefetchData_t * L2Data )
  {
    for( unsigned int i=0 ; i<4 ; i++ )
    {
      if(L1Data[i].LastRequestCycle==cycle && L1Data[i].LastRequestPrefetch==false)
      {
        if(L1Data[i].hit==true)
          l1Hits++;
        else
          l1Misses++;
        // Demand request to the L1 cache is here.
        if(printL1)
        {
          ;//cout<<"\tL1";
          if(L1Data[i].hit==false)
            ;//cout<<"M";
          else
            ;//cout<<"H";
          PRINT_DEC_ONLY(L1Data[i]);
        }
        l1Stats.Demand(cycle, L1Data[i].DataAddr);
      }
    }

    if(L2Data[0].LastRequestCycle==cycle && L2Data[0].LastRequestPrefetch==false)
    {
      if(L2Data[0].hit==true)
        l2Hits++;
      else
        l2Misses++;

      ;//cout<<"\tL2";
      if(L2Data[0].hit==false)
        ;//cout<<"M";
      else
        ;//cout<<"H";
      PRINT_DEC_ONLY(L2Data[0]);
      // Demand request to L2 cache is here.
      // See if we hit a request we already prefetched.
      l2Stats.Demand(cycle, L2Data[0].DataAddr);
    }
///    ;//cout<<"In BASE\n"<<endl;
  }

  inline void IssueL2(COUNTER cycle, CacheAddr_t a, PrefetchData_t * trigger)
  {
///    ;//cout<<"IN BASE IL2"<<endl;
    if(GetPrefetchBit(1, a)==-1)
    {
      if(true==l2Stats.Prefetch(cycle, a, trigger))
      {
        if(1==IssueL2Prefetch(cycle, a))
        {
          l2PrefetchesBlocked++;
          ;//cout<<" BLOCKED!"<<endl;
        }
        else
          ;//cout<<endl;
      }
    }
  }

  inline void IssueL1(COUNTER cycle, CacheAddr_t a, PrefetchData_t * trigger)
  {
    if(GetPrefetchBit(0, a)==-1)
    {
      if(true==l1Stats.Prefetch(cycle, a, trigger))
      {
        if(1==IssueL1Prefetch(cycle, a))
        {
          l1PrefetchesBlocked++;
        }
      }
    }
  }

  void PrintStats()
  {
    l1Stats.PrintStats();
    l2Stats.PrintStats();
    ;//cout<<"L1 hits: "<<l1Hits<<" misses: "<<l1Misses<<endl;
    ;//cout<<"L2 hits: "<<l2Hits<<" misses: "<<l2Misses<<endl;
    ;//cout<<"l1PrefetchesBlocked "<<l1PrefetchesBlocked<<endl;
    ;//cout<<"l2PrefetchesBlocked "<<l2PrefetchesBlocked<<endl;
  }
};


#endif  /*__BASE_PREFETCHER_H__*/

#ifndef  __NNL_PREFETCHER_H__
#define  __NNL_PREFETCHER_H__

// TODO: Some form of history compression possible? 
// Possibly use region: offset... have to deal with potentially large number of regions
// especially in the GHB. 
// In the LHB, this should be possible....
//
// Cross-region analysis to figure out what is missing?
//
// FILTERING IDEAS: GHB of recently done prefetches. done!
// For geopattern, do the check on number+cachelines, instead of double value...
// For loops, n, n+1 is not very good for most cases... Need to remove those!
// USE SCORE... that's what is good for.
//
// FIXME: i am using .end() instead of latest for l2 misses.

///#include  "globals.h"
///#include  "cluster.h"
///#include  "base_prefetcher.h"
#include  <cmath>

struct MyPrefetchData_t : public PrefetchData_t
{
  MyPrefetchData_t(PrefetchData_t & d, int type) : PrefetchData_t(d)
  {
    AccessType^=type;
  }
  MyPrefetchData_t()
  {
  }
};

// Time sorted addresses
typedef deque<MyPrefetchData_t> TSA;
// Space sorted addresses
typedef list<MyPrefetchData_t> SSA;



struct SLHBL
{
  // for global history buffer, PC is not used at all... so it is not counted.
  CacheAddr_t PC;
  TSA tsa;

  // in hardware, this wont be seperate... this is only seperate for SIMULATION PERFORMANCE!
  SSA ssa;
  unsigned int max;
  SLHBL(unsigned int m)
  {
    max=m;
  }
  void Clear()
  {
    tsa.clear();
    ssa.clear();
  }
  bool Find(CacheAddr_t a)
  {
    for( TSA::reverse_iterator i=tsa.rbegin() ; i!=tsa.rend() ; i++ )
    {
      if(i->DataAddr==a)
        return true;
    }
    return false;
  }
  bool Find(MyPrefetchData_t * d, TSA::iterator & i)
  {
    for( i=tsa.begin() ; i!=tsa.end() ; i++ )
    {
      if(i->DataAddr==d->DataAddr)
        return true;
    }
    return false;
  }
  bool Add(MyPrefetchData_t * d)
  {
    TSA::iterator i;
    // If we can find it, don't add it.
    if(Find(d, i)==true)
    {
      if(IS_L1(d->AccessType)==0 && IS_L1(i->AccessType)!=0)
      {
        // We need to change it to L2.
        i->AccessType=d->AccessType;
        for( SSA::iterator j=ssa.begin() ; j!=ssa.end() ; j++ )
        {
          if(j->DataAddr==d->DataAddr)
          {
            j->AccessType=d->AccessType;
          }
        }
        return true;
      }
      return false;
    }
    assert(tsa.size()==ssa.size());
    if(tsa.size()==max)
    {
      RemoveFront();
    }
    PushBack(d);
    return true;
  }
  void AddWithoutCheck(MyPrefetchData_t * d)
  {
    Add(d);
  }
  void PushBack(MyPrefetchData_t * d)
  {
    tsa.push_back(*d);
    SSA::iterator i;
    for( i=ssa.begin() ; i!=ssa.end() ; i++ )
    {
      if(i->DataAddr>d->DataAddr)
        break;
    }
    ssa.insert(i, *d);
  }
  void RemoveFront()
  {
    if(tsa.size()==0)
      return;
    MyPrefetchData_t & d=*(tsa.begin());
    SSA::iterator i=ssa.begin();
    while(i->DataAddr!=d.DataAddr)
      i++;
    assert(i!=ssa.end());
    ssa.erase(i);
    tsa.pop_front();
  }
  void Print()
  {
#if 0
    Clusterer c;
    // to keep the clusterer happy
    unsigned int total;
    priority_queue< pair<unsigned int, CacheAddr_t> >  sorted;

    AddressList l2Hits;
    AddressList l2Misses;
    AddressList l1Hits;
    AddressList l1Misses;
    AddressList all;
    if(tsa.begin()==tsa.end())
    {
      ;//cout<<"List empty!"<<endl;
      return;
    }
    ;//cout<<"ALL:"<<endl;
    for( TSA::iterator i=tsa.begin() ; i!=tsa.end() ; i++ )
    {
      ;//cout<<LINE_NUMBER(i->DataAddr);
      if(i->AccessType&L1_ACCESS)
        ;//cout<<"_L1";
      else
        ;//cout<<"_L2";
      if(i->hit==true)
        ;//cout<<"_H";
      else
        ;//cout<<"_M";
      ;//cout<<" ";
      if( (i->AccessType&L1_ACCESS) && i->hit==true)
      {
        l1Hits.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS) && i->hit==false)
      {
        l1Misses.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS)==0 && i->hit==true)
      {
        l2Hits.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS)==0 && i->hit==false)
      {
        l2Misses.push_back(LINE_NUMBER(i->DataAddr));
      }
      all.push_back(LINE_NUMBER(i->DataAddr));
    }
    ;//cout<<endl;
    ;//cout<<"All Deltas:"<<endl;
    PRINT_DELTAS(all, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L1Hits:"<<endl;
    PRINT_DELTAS(l1Hits, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L1Misses:"<<endl;
    PRINT_DELTAS(l1Misses, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L2Hits:"<<endl;
    PRINT_DELTAS(l2Hits, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L2Misses:"<<endl;
    PRINT_DELTAS(l2Misses, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    if(l2Misses.size()>20)
    {
      c.Cluster(l2Misses, sorted, total);
      ;//cout<<endl;
    }

    ;//cout<<"SORTED"<<endl;
    all.clear();l1Hits.clear();l1Misses.clear();l2Hits.clear();l2Misses.clear();
    for( SSA::iterator i=ssa.begin() ; i!=ssa.end() ; i++ )
    {
      ;//cout<<LINE_NUMBER(i->DataAddr);
      if(i->AccessType&L1_ACCESS)
        ;//cout<<"_L1";
      else
        ;//cout<<"_L2";
      if(i->hit==true)
        ;//cout<<"_H";
      else
        ;//cout<<"_M";
      ;//cout<<" ";
      if( (i->AccessType&L1_ACCESS) && i->hit==true)
      {
        l1Hits.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS) && i->hit==false)
      {
        l1Misses.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS)==0 && i->hit==true)
      {
        l2Hits.push_back(LINE_NUMBER(i->DataAddr));
      }
      else if((i->AccessType&L1_ACCESS)==0 && i->hit==false)
      {
        l2Misses.push_back(LINE_NUMBER(i->DataAddr));
      }
      all.push_back(LINE_NUMBER(i->DataAddr));
    }
    ;//cout<<endl;
    ;//cout<<"All Deltas:"<<endl;
    PRINT_DELTAS(all, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L1Hits:"<<endl;
    PRINT_DELTAS(l1Hits, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L1Misses:"<<endl;
    PRINT_DELTAS(l1Misses, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L2Hits:"<<endl;
    PRINT_DELTAS(l2Hits, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;
    ;//cout<<"L2Misses:"<<endl;
    PRINT_DELTAS(l2Misses, LINE_NUMBER(tsa.rbegin()->DataAddr));
    ;//cout<<endl;

#if 0
    CacheAddr_t a;
    ;//cout<<"All Deltas Sorted:"<<endl;
    a=0;
    for( SSA::iterator i=ssa.begin() ; i!=ssa.end() ; i++ )
    {
      ;//cout<<(signed long long)(LINE_NUMBER(i->DataAddr)-a);
      if(i->AccessType&L1_ACCESS)
        ;//cout<<"_L1";
      else
        ;//cout<<"_L2";
      if(i->hit==true)
        ;//cout<<"_H";
      else
        ;//cout<<"_M";
      ;//cout<<" ";
      a=LINE_NUMBER(i->DataAddr);
    }
    ;//cout<<endl;
#endif
#endif
  }
  void PrintTSA()
  {
    AddressList al;
    for( TSA::iterator i=tsa.begin() ; i!=tsa.end() ; i++ )
    {
      al.push_back(LINE_NUMBER(i->DataAddr));
    }

    if(al.size()>1)
    {
      ;//cout<<"PRINTING TSA"<<endl;
      PRINT_DELTAS(al, LINE_NUMBER(*(al.rbegin())) );
      ;//cout<<endl;
    }
  }
};

class NNLPrefetcher : public BasePrefetcher
{
  public:
  SLHBL g;
  list<SLHBL> l;

  unsigned int lhl;
  unsigned int ghl;
  unsigned int nlh;

  unsigned int hits;
  unsigned int misses;

  bool printLocal;

  map<CacheAddr_t, int> l1Prefetches;
  map<CacheAddr_t, int> l2Prefetches;

  PrefetchData_t * currentP;

  NNLPrefetcher(unsigned int localHistoryLength, unsigned int numLocalHistories, unsigned int globalHistoryLength)
    : g(globalHistoryLength)
  {
    lhl=localHistoryLength;
    ghl=globalHistoryLength;
    nlh=numLocalHistories;

    hits=0;
    misses=0;

    l.resize(nlh, lhl);
  }
  void IssuePrefetches( COUNTER cycle, PrefetchData_t *L1Data, PrefetchData_t * L2Data )
  {
    BasePrefetcher::IssuePrefetches(cycle, L1Data, L2Data);
    list<SLHBL>::iterator it;
    bool added=false;
    bool addedLocal=false;

    for( unsigned int i=0 ; i<4 ; i++ )
    {
      if(L1Data[i].LastRequestCycle==cycle && L1Data[i].LastRequestPrefetch==false)
      {
        MyPrefetchData_t d(L1Data[i], L1_ACCESS);
        currentP=L2Data;

        added=g.Add(&d);
        if(added)
        {
          PrefetchPattern(cycle, &d, g);
        }

        addedLocal=AddLocal(&d, cycle, it);
        if(addedLocal)
        {
          PrefetchPattern(cycle, &d, *(l.rbegin()) );
          PrefetchGeoPattern(cycle, &d, *(l.rbegin()) );
        }
      }
    }

    if(L2Data[0].LastRequestCycle==cycle&&L2Data[0].LastRequestPrefetch==false)
    {
      MyPrefetchData_t d(L2Data[0], 0);
      added=g.Add(&d);
      addedLocal=AddLocal(&d, cycle, it);
      PDEBUG(added);
      PDEBUG(addedLocal);

#if 1
      if(added)
      {
        PrintPattern(cycle, &d, g);
        PrefetchPattern(cycle, &d, g);
        PrefetchSSAPattern(cycle, &d, g);
      }
      if(addedLocal)
      {
        PrintPattern(cycle, &d, *(l.rbegin()) );
        PrefetchPattern(cycle, &d, *(l.rbegin()) );
      }
#endif

    }
    FinallyIssue();
///    if(cycle>9000)
///      exit(0);
  }
  bool TestGlobalAndLocal(CacheAddr_t a)
  {
    if(g.Find(a)==true)
      return true;
    for( list<SLHBL>::reverse_iterator i=l.rbegin() ; i!=l.rend() ; i++ )
    {
      if(i->Find(a)==true)
        return true;
    }
    return false;
  }

  void FinallyIssue()
  {
    for( map<CacheAddr_t, int>::iterator i=l1Prefetches.begin() ; i!=l1Prefetches.end() ; i++ )
    {
      if(TestGlobalAndLocal(i->first)==false)
        IssueL2(currentP->LastRequestCycle, i->first, currentP);
    }
    for( map<CacheAddr_t, int>::iterator i=l2Prefetches.begin() ; i!=l2Prefetches.end() ; i++ )
    {
      if(TestGlobalAndLocal(i->first)==false)
        IssueL2(currentP->LastRequestCycle, i->first, currentP);
    }
    l2Prefetches.clear();
    l1Prefetches.clear();
  }
  void PrintPattern(COUNTER cycle, MyPrefetchData_t * d, SLHBL & buffer)
  {
    // Get raw addresses of L2 misses from the time-ordered buffer and see if there is any 
    // pattern in them.
    AddressList tsaL2Misses;
    AddressList tsaAccesses;
    AddressList all;
    AddressList allSorted;
    for( TSA::iterator i=buffer.tsa.begin() ; i!=buffer.tsa.end() ; i++ )
    {
///      ;//cout<<"Comparing: "<<LINE_NUMBER(i->DataAddr)<<" and "<<LINE_NUMBER(d->DataAddr);
      if( IS_L1(i->AccessType)==0 && labs(i->DataAddr-d->DataAddr)<= (1<<(10+LINE_BITS)) )
      {
///        ;//cout<<" MATCH"<<endl;
        tsaL2Misses.push_back(LINE_NUMBER(i->DataAddr) );
      }
      if( labs(i->DataAddr-d->DataAddr)<=(1<<(10+LINE_BITS)))
      {
        tsaAccesses.push_back(LINE_NUMBER(i->DataAddr) );
      }
      all.push_back(LINE_NUMBER(i->DataAddr));
    }
    for( SSA::iterator i=buffer.ssa.begin() ; i!=buffer.ssa.end() ; i++ )
    {
      allSorted.push_back(LINE_NUMBER(i->DataAddr));
    }
    

    if(tsaL2Misses.size()>1)
    {
#if 1
      ;//cout<<"ADDR: "<<LINE_NUMBER(d->DataAddr)<<endl;
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"REGION tsaL2Misses: "<<endl;
      PRINT_DELTAS(tsaL2Misses, LINE_NUMBER(d->DataAddr) );
      ;//cout<<endl;
#endif

    }
    else
    {
      if(&buffer==&g)
        ;//cout<<"GLOBAL L2 REGION too small"<<endl;
      else
        ;//cout<<"LOCAL L2 REGION too small"<<endl;
    }

    if(tsaAccesses.size()>1)
    {
#if 1
      ;//cout<<"ADDR: "<<LINE_NUMBER(d->DataAddr)<<endl;
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"REGION tsaAccesses: "<<endl;
      PRINT_DELTAS(tsaAccesses, LINE_NUMBER(d->DataAddr) );
      ;//cout<<endl;
      
#endif
    }
    else
    {
      if(&buffer==&g)
        ;//cout<<"GLOBAL ACCESS REGION too small"<<endl;
      else
        ;//cout<<"LOCAL ACCESS REGION too small"<<endl;
    }
    if(all.size()>1)
    {
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"FULL tsaAccesses: "<<endl;
      PRINT_DELTAS(all, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
      ;//cout<<"FULL ssaAccesses: "<<endl;
      PRINT_DELTAS(allSorted, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
    }
    else
    {
      if(&buffer==&g)
        ;//cout<<"GLOBAL ALL too small"<<endl;
      else
        ;//cout<<"LOCAL ALL too small"<<endl;
    }
  }
  void PrefetchGeoPattern(COUNTER cycle, MyPrefetchData_t * d, SLHBL & buffer)
  {
    AddressList al;
    buffer.PrintTSA();

    GET_DELTAS(buffer.tsa, al);
    PRINT_ABS(al);
    ;//cout<<endl;
    if(al.size()>3)
    {
      // Check if they follow a geometric pattern.
      CacheAddr_t r;
      int score=0;
      AddressList::reverse_iterator i=al.rbegin();
      AddressList::reverse_iterator j=i;
      j++;
      r=(*i)/(*j);
      while(j!=al.rend() && r*(*j)==(*i) )
      {
        if(r==(*i)/(*j) )
        {
          score++;
        }
        i++; j++;
      }
      if(score)
      {
///        ;//cout<<"Score is: "<<score<<" diff: "<<*(al.rbegin())<<" r: "<< r<< endl;
///        ;//cout<<"CURRENT: "<<LINE_NUMBER(d->DataAddr)<<endl;
///        ;//cout<<"DIFF: "<<LINE_NUMBER(*(al.rbegin())*r )<<endl;
///        ;//cout<<"ADDING: "<<LINE_NUMBER( d->DataAddr+*(al.rbegin())*r ) <<endl;
///        ;//cout<<"ADDING: "<<LINE_NUMBER( d->DataAddr+*(al.rbegin())*r*r ) <<endl;
        AddPrefetch(0, d->DataAddr+*(al.rbegin())*r, 50);
        AddPrefetch(0, d->DataAddr+*(al.rbegin())*r+*(al.rbegin())*r*r, 25);
///        exit(0);
      }
      else
      {
        // Try with a more fuzzy match.
        i=al.rbegin();
        j=i;
        j++;
        double dr=(double) (*i)/(*j);
        score=0;
        if(dr>1)
        {
          while(j!=al.rend())
          {
            double newdr;
            newdr=(double) (*i)/(*j);
            CacheAddr_t newdra=(CacheAddr_t)round(newdr);
///            PDEBUG(newdra);
///            PDEBUG(*i);
///            PDEBUG(*j);
            // FIXME: This should be LINE_SIZE*newdra
            if( labs(newdra*(*j)-(*i) ) < LINE_SIZE*2)
///            if(abs(newdr-dr)<0.02)
            {
              score++;
            }
            else
              break;
            i++;j++;
          }
          if(score>1)
          {
            // We have a fuzzy match.
            //
            ;//cout<<"FUZZY MATCH score: "<<score<<" raw: "<<dr<<" rounded: "<<round(dr)<<endl;
            CacheAddr_t dra=(CacheAddr_t)round(dr);
            AddPrefetch(0, d->DataAddr+ *(al.rbegin())*dra, 50);
            AddPrefetch(0, d->DataAddr+*(al.rbegin())*r+*(al.rbegin())*r*r, 25);
          }
        }
        // TODO: Add one for decreasing ones.
      }
    }
  }
  void PrefetchSSAPattern(COUNTER cycle, MyPrefetchData_t * d, SLHBL & buffer)
  {
    // Try to prefetch from 
    // Obtain a few samples in the left direction... right now hard-coded to 8
    if(buffer.ssa.size()<4)
      return;
    SSA::iterator i=buffer.ssa.begin();
    while(i->DataAddr!=d->DataAddr)
    {
      i++;
    }
    assert(i!=buffer.ssa.end());
    SSA::reverse_iterator j(i);
    AddressList left;
    AddressList right;
    j--;
    while(j!=buffer.ssa.rend() && labs(j->DataAddr-d->DataAddr)<=(1<<(10+LINE_BITS)) )
    {
      left.push_front(LINE_NUMBER(j->DataAddr));
      j++;
    }

    SSA::iterator k=i;
    while(k!=buffer.ssa.end() && labs(d->DataAddr-k->DataAddr)<=(1<<(10+LINE_BITS)) )
    {
      right.push_front(LINE_NUMBER(k->DataAddr));
      k++;
    }

    // TODO: TODO: TODO: Make sure our own address is part of the future accesses?
    // Should reduce number of prefetches technically.... not sure about PERF!
    if(left.size()>4)
    {
      ;//cout<<"**** LEFT"<<__FUNCTION__<<"****"<<endl;
      PRINT_DELTAS(left, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
      map<int, int> strideMap;
      AddressList::iterator it=left.end();
      it--;
///      if(IS_L1(d->AccessType)==0)
      {
      FindStride(left, it, strideMap);
      AddStrides(strideMap, d, left, 0, 100);
      }
    }
    if(right.size()>4)
    {
      ;//cout<<"**** RIGHT"<<__FUNCTION__<<"****"<<endl;
      PRINT_DELTAS(left, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
      map<int, int> strideMap;
      AddressList::iterator it=right.end();
      it--;
///      if(IS_L1(d->AccessType)==0)
      {
      FindStride(right, it, strideMap);
      AddStrides(strideMap, d, right, 0, 100);
      }
    }
  }
  void PrefetchPattern(COUNTER cycle, MyPrefetchData_t * d, SLHBL & buffer)
  {
    // Get raw addresses of L2 misses from the time-ordered buffer and see if there is any 
    // pattern in them.
    AddressList tsaL2Misses;
    AddressList tsaAccesses;
    AddressList all;
    AddressList allSorted;
    for( TSA::iterator i=buffer.tsa.begin() ; i!=buffer.tsa.end() ; i++ )
    {
///      ;//cout<<"Comparing: "<<LINE_NUMBER(i->DataAddr)<<" and "<<LINE_NUMBER(d->DataAddr);
      if( IS_L1(i->AccessType)==0 && labs(i->DataAddr-d->DataAddr)<= (1<<(10+LINE_BITS)) )
      {
///        ;//cout<<" MATCH"<<endl;
        tsaL2Misses.push_back(LINE_NUMBER(i->DataAddr) );
      }
      if( labs(i->DataAddr-d->DataAddr)<=(1<<(10+LINE_BITS)))
      {
        tsaAccesses.push_back(LINE_NUMBER(i->DataAddr) );
      }
      all.push_back(LINE_NUMBER(i->DataAddr));
    }
    for( SSA::iterator i=buffer.ssa.begin() ; i!=buffer.ssa.end() ; i++ )
    {
      allSorted.push_back(LINE_NUMBER(i->DataAddr));
    }

    if(tsaL2Misses.size()>1)
    {
#if 0
      ;//cout<<"ADDR: "<<LINE_NUMBER(d->DataAddr)<<endl;
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"REGION tsaL2Misses: "<<endl;
      PRINT_DELTAS(tsaL2Misses, LINE_NUMBER(d->DataAddr) );
      ;//cout<<endl;
#endif

      map<int, int> strideMap;
      AddressList::iterator it=tsaL2Misses.end();
      it--;
///      if(IS_L1(d->AccessType)==0)
      {
      FindStride(tsaL2Misses, it, strideMap);
      AddStrides(strideMap, d, tsaL2Misses, 0, 100);
      }
    }

    if(tsaAccesses.size()>1)
    {
#if 0
      ;//cout<<"ADDR: "<<LINE_NUMBER(d->DataAddr)<<endl;
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"REGION tsaAccesses: "<<endl;
      PRINT_DELTAS(tsaAccesses, LINE_NUMBER(d->DataAddr) );
      ;//cout<<endl;
      
      if(&buffer==&g)
        ;//cout<<"GLOBAL ";
      else
        ;//cout<<"LOCAL ";
      ;//cout<<"FULL tsaAccesses: "<<endl;
      PRINT_DELTAS(all, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
      ;//cout<<"FULL ssaAccesses: "<<endl;
      PRINT_DELTAS(allSorted, LINE_NUMBER(d->DataAddr));
      ;//cout<<endl;
#endif

      map<int, int> strideMap;
      AddressList::iterator it=tsaAccesses.end();
      it--;
///      if(IS_L1(d->AccessType)==0)
      {
      FindStride(tsaAccesses, it, strideMap);
      AddStrides(strideMap, d, tsaAccesses, 0, 20);
      }
    }
  }
  void AddStrides(map<int, int> & strideMap, MyPrefetchData_t * d, AddressList & region, int type, int baseConfidence)
  {
    int maxStride=0;
    int maxStrideSteps=0;
    if(region.size()<2)
    {
///      ;//cout<<"Region size is less than 2"<<endl;
      return;
    }
///    PDEBUG(strideMap.size());
    for( map<int, int>::iterator i=strideMap.begin() ; i!=strideMap.end() ; i++ )
    {
      // Interpret it as preferring many smaller loops over one larger one.
///      PDEBUG(i->first);
///      PDEBUG(i->second);
      if(i->second>maxStride || (i->second==maxStride && i->first<maxStrideSteps) )
      {
        maxStride=i->second;
        maxStrideSteps=i->first;
      }
    }
    
    // added to take into account ssa...
#if 0
    for( AddressList::reverse_iterator i=region.rbegin() ; ; )
    {
      if(*i==LINE_NUMBER(d->DataAddr))
      {
        break;
      }
      i++;
      if(i==region.rend())
      {
        return;
      }
    }
#endif


///    ;//cout<<"maxStride: "<<maxStride<<endl;
    // This number has to be 1 for 429.mcf to be happy.
    if(maxStride<2)
      return;
///        >3 && (float)maxStride/maxStrideSteps > 0.8)
///    if(maxStride>3 && (float)maxStride/maxStrideSteps>0.8)
    {
      AddressList::iterator currentIt;
      CacheAddr_t a=LINE_NUMBER(d->DataAddr);
      AddressList strides;
      currentIt=region.end();
      currentIt--;
      ;//cout<<"maxStride: "<<maxStride<<" maxStrideSteps: "<<maxStrideSteps<<endl;
      GetN(strides, region, currentIt, -maxStrideSteps);

///      ;//cout<<"PRINTING STRIDES: "<<maxStride<<"/"<<maxStrideSteps<<endl;
///      PRINT_ABS(strides);
///      ;//cout<<endl;
      for( unsigned int i=0 ; i<4 ; i++ )
      {
        int confidence;
        a=a+(strides[i%strides.size()]);
        // TODO: Add base confidence.
        confidence=maxStride+maxStride/maxStrideSteps-i;
        AddPrefetch(type, a<<LINE_BITS, baseConfidence+confidence);
      }
    }
  }
  void AddPrefetch(int type, CacheAddr_t a, int confidence)
  {
    map<CacheAddr_t, int> & m=(type==0)?l2Prefetches:l1Prefetches;
    m[a]+=confidence;
    if(type==0)
      ;//cout<<"Added L2 prefetch: ";
    else
      ;//cout<<"Added L1 prefetch: ";
    ;//cout<<LINE_NUMBER(a)<<" "<<confidence<<endl;
  }

  void Print()
  {
    ;//cout<<"Printing GLOBAL:"<<endl;
    g.Print();
    ;//cout<<"Printing LOCAL:"<<endl;
    l.rbegin()->Print();
    BasePrefetcher::PrintStats();
  }
  void PrintStats()
  {
    BasePrefetcher::PrintStats();
    unsigned int GHBBits=ghl*32;
    unsigned int LHBBits=(1+lhl)*nlh*32;
    unsigned int FilterBits=l2Stats.max*32;
    unsigned int TotalBits=FilterBits+LHBBits+GHBBits;
    cout<<"NNLPrefetcher: ghl: "<<ghl<<" nlh: "<<nlh<<" lhl: "<<lhl<<endl;
    cout<<"Space consumed: by global: "<<GHBBits<<" bits"<<endl;
    cout<<"Space consumed: by local: "<<LHBBits<<" bits"<<endl;
    cout<<"Space consumed: by filter: "<<(FilterBits)<<" bits"<<endl;
    cout<<"Space consumed: by total: "<<(GHBBits+LHBBits+FilterBits)<<" bits"<<endl;
    cout<<"Btw, budget is: "<<32*1024<<" bits"<<endl;
    cout<<"Prefetcher consumes: "<<(float)(100.0f)*(TotalBits)/(32*1024)<<" \% of allowable budget"<<endl;
  }
  void Touch(list<SLHBL>::iterator i)
  {
    l.splice(l.end(), l, i);
    return;
  }
  bool AddLocal(MyPrefetchData_t * d, COUNTER cycle, list<SLHBL>::iterator & pos)
  {
    bool ret;
    list<SLHBL>::reverse_iterator i;
    list<SLHBL>::iterator it;
    for( i=l.rbegin() ; i!=l.rend() ; i++ )
    {
      if(i->PC==d->LastRequestAddr)
        break;
    }
    if(i==l.rend())
    {
      it=l.begin();
      it->Clear();
      it->PC=d->LastRequestAddr;
      misses++;
    }
    else
    {
      i++;
      it=i.base();
      hits++;
    }

    ret=it->Add(d);
    Touch(it);

    pos=it;
    return ret;
  }
};


#endif  /*__NNL_PREFETCHER_H__*/



#endif
