changeset 52c552138ba1 in /z/repo/gem5
details: http://repo.gem5.org/gem5?cmd=changeset;node=52c552138ba1
description:
ext: add McPAT source
This patch adds the source for McPAT, a power, area, and timing modeling
framework.
diffstat:
ext/mcpat/ARM_A9.xml | 415 +++
ext/mcpat/ARM_A9_2000.xml | 463 ++++
ext/mcpat/ARM_A9_800.xml | 463 ++++
ext/mcpat/Alpha21364.xml | 456 +++
ext/mcpat/Niagara1.xml | 442 +++
ext/mcpat/Niagara1_sharing.xml | 400 +++
ext/mcpat/Niagara1_sharing_DC.xml | 442 +++
ext/mcpat/Niagara1_sharing_SBT.xml | 455 +++
ext/mcpat/Niagara1_sharing_ST.xml | 443 +++
ext/mcpat/Niagara2.xml | 438 +++
ext/mcpat/Penryn.xml | 456 +++
ext/mcpat/README | 226 +
ext/mcpat/XML_Parse.cc | 1798 +++++++++++++++
ext/mcpat/XML_Parse.h | 591 +++++
ext/mcpat/Xeon.xml | 455 +++
ext/mcpat/arch_const.h | 276 ++
ext/mcpat/array.cc | 302 ++
ext/mcpat/array.h | 101 +
ext/mcpat/basic_components.cc | 127 +
ext/mcpat/basic_components.h | 265 ++
ext/mcpat/cacti/README | 94 +
ext/mcpat/cacti/Ucache.cc | 916 +++++++
ext/mcpat/cacti/Ucache.h | 115 +
ext/mcpat/cacti/arbiter.cc | 130 +
ext/mcpat/cacti/arbiter.h | 79 +
ext/mcpat/cacti/area.cc | 47 +
ext/mcpat/cacti/area.h | 71 +
ext/mcpat/cacti/bank.cc | 198 +
ext/mcpat/cacti/bank.h | 69 +
ext/mcpat/cacti/basic_circuit.cc | 829 +++++++
ext/mcpat/cacti/basic_circuit.h | 248 ++
ext/mcpat/cacti/batch_tests | 41 +
ext/mcpat/cacti/cache.cfg | 175 +
ext/mcpat/cacti/cacti.i | 8 +
ext/mcpat/cacti/cacti.mk | 51 +
ext/mcpat/cacti/cacti_interface.cc | 173 +
ext/mcpat/cacti/cacti_interface.h | 633 +++++
ext/mcpat/cacti/component.cc | 236 ++
ext/mcpat/cacti/component.h | 84 +
ext/mcpat/cacti/const.h | 270 ++
ext/mcpat/cacti/contention.dat | 126 +
ext/mcpat/cacti/crossbar.cc | 161 +
ext/mcpat/cacti/crossbar.h | 85 +
ext/mcpat/cacti/decoder.cc | 1577 +++++++++++++
ext/mcpat/cacti/decoder.h | 247 ++
ext/mcpat/cacti/htree2.cc | 641 +++++
ext/mcpat/cacti/htree2.h | 97 +
ext/mcpat/cacti/io.cc | 2350 ++++++++++++++++++++
ext/mcpat/cacti/io.h | 44 +
ext/mcpat/cacti/main.cc | 191 +
ext/mcpat/cacti/makefile | 28 +
ext/mcpat/cacti/mat.cc | 1748 +++++++++++++++
ext/mcpat/cacti/mat.h | 148 +
ext/mcpat/cacti/nuca.cc | 612 +++++
ext/mcpat/cacti/nuca.h | 100 +
ext/mcpat/cacti/parameter.cc | 713 ++++++
ext/mcpat/cacti/parameter.h | 367 +++
ext/mcpat/cacti/router.cc | 311 ++
ext/mcpat/cacti/router.h | 115 +
ext/mcpat/cacti/subarray.cc | 196 +
ext/mcpat/cacti/subarray.h | 70 +
ext/mcpat/cacti/technology.cc | 2921 +++++++++++++++++++++++++
ext/mcpat/cacti/uca.cc | 426 +++
ext/mcpat/cacti/uca.h | 95 +
ext/mcpat/cacti/wire.cc | 832 +++++++
ext/mcpat/cacti/wire.h | 124 +
ext/mcpat/core.cc | 4135 ++++++++++++++++++++++++++++++++++++
ext/mcpat/core.h | 262 ++
ext/mcpat/globalvar.h | 48 +
ext/mcpat/interconnect.cc | 222 +
ext/mcpat/interconnect.h | 111 +
ext/mcpat/iocontrollers.cc | 446 +++
ext/mcpat/iocontrollers.h | 87 +
ext/mcpat/logic.cc | 1014 ++++++++
ext/mcpat/logic.h | 233 ++
ext/mcpat/main.cc | 101 +
ext/mcpat/makefile | 28 +
ext/mcpat/mcpat.mk | 81 +
ext/mcpat/mcpatXeonCore.mk | 81 +
ext/mcpat/memoryctrl.cc | 736 ++++++
ext/mcpat/memoryctrl.h | 113 +
ext/mcpat/noc.cc | 355 +++
ext/mcpat/noc.h | 75 +
ext/mcpat/processor.cc | 839 +++++++
ext/mcpat/processor.h | 79 +
ext/mcpat/results/A9_2000 | 321 ++
ext/mcpat/results/A9_2000_withIOC | 410 +++
ext/mcpat/results/A9_800 | 320 ++
ext/mcpat/results/Alpha21364 | 441 +++
ext/mcpat/results/Alpha21364_90nm | 408 +++
ext/mcpat/results/Penryn | 315 ++
ext/mcpat/results/T1 | 296 ++
ext/mcpat/results/T1_DC_64 | 270 ++
ext/mcpat/results/T1_SBT_64 | 252 ++
ext/mcpat/results/T1_ST_64 | 270 ++
ext/mcpat/results/T2 | 321 ++
ext/mcpat/results/Xeon_core | 341 ++
ext/mcpat/results/Xeon_uncore | 341 ++
ext/mcpat/sharedcache.cc | 1162 ++++++++++
ext/mcpat/sharedcache.h | 89 +
ext/mcpat/technology_xeon_core.cc | 2772 ++++++++++++++++++++++++
ext/mcpat/version.h | 40 +
ext/mcpat/xmlParser.cc | 2891 +++++++++++++++++++++++++
ext/mcpat/xmlParser.h | 764 ++++++
104 files changed, 48876 insertions(+), 0 deletions(-)
diffs (truncated from 49294 to 300 lines):
diff -r 2434d2fa50b6 -r 52c552138ba1 ext/mcpat/ARM_A9.xml
--- /dev/null Thu Jan 01 00:00:00 1970 +0000
+++ b/ext/mcpat/ARM_A9.xml Tue Apr 01 12:44:30 2014 -0400
@@ -0,0 +1,415 @@
+<?xml version="1.0" ?>
+<component id="root" name="root">
+ <component id="system" name="system">
+ <!--McPAT will skip the components if number is set to 0 -->
+ <param name="number_of_cores" value="2"/>
+ <param name="number_of_L1Directories" value="2"/>
+ <param name="number_of_L2Directories" value="0"/>
+ <param name="number_of_L2s" value="0"/> <!-- This number means
how many L2 clusters in each cluster there can be multiple banks/ports -->
+ <param name="Private_L2" value="0"/><!--1 Private, 0
shared/coherent -->
+ <param name="number_of_L3s" value="0"/> <!-- This number means
how many L3 clusters -->
+ <param name="number_of_NoCs" value="1"/>
+ <param name="homogeneous_cores" value="1"/><!--1 means homo -->
+ <param name="homogeneous_L2s" value="1"/>
+ <param name="homogeneous_L1Directorys" value="1"/>
+ <param name="homogeneous_L2Directorys" value="1"/>
+ <param name="homogeneous_L3s" value="1"/>
+ <param name="homogeneous_ccs" value="1"/><!--cache coherence
hardware -->
+ <param name="homogeneous_NoCs" value="1"/>
+ <param name="core_tech_node" value="40"/><!-- nm -->
+ <param name="target_core_clockrate" value="2000"/><!--MHz -->
+ <param name="temperature" value="380"/> <!-- Kelvin -->
+ <param name="number_cache_levels" value="2"/>
+ <param name="interconnect_projection_type" value="1"/><!--0:
aggressive wire technology; 1: conservative wire technology -->
+ <param name="device_type" value="1"/><!--0: HP(High Performance
Type); 1: LSTP(Low standby power) 2: LOP (Low Operating Power) -->
+ <param name="longer_channel_device" value="1"/><!-- 0 no use; 1
use when appropriate -->
+ <param name="Embedded" value="1"/><!-- Embedded processor like
ARM or general purpose processors? -->
+ <param name="machine_bits" value="32"/>
+ <param name="virtual_address_width" value="32"/>
+ <param name="physical_address_width" value="32"/>
+ <param name="virtual_memory_page_size" value="4096"/>
+ <!-- address width determines the tag_width in Cache, LSQ and
buffers in cache controller
+ default value is machine_bits, if not set -->
+ <stat name="total_cycles" value="100000"/>
+ <stat name="idle_cycles" value="0"/>
+ <stat name="busy_cycles" value="100000"/>
+ <!--This page size(B) is completely different from the
page size in Main memo section. This page size is the size of
+ virtual memory from OS/Archi perspective; the page size
in Main memo section is the actual physical line in a DRAM bank -->
+ <!-- *********************** cores ******************* -->
+ <component id="system.core0" name="core0">
+ <!-- Core property -->
+ <param name="clock_rate" value="2000"/>
+ <!-- for cores with unknown timing, set to 0 to force
off the opt flag -->
+ <param name="opt_local" value="1"/>
+ <param name="instruction_length" value="32"/>
+ <param name="opcode_width" value="7"/>
+ <param name="x86" value="0"/>
+ <param name="micro_opcode_width" value="8"/>
+ <param name="machine_type" value="0"/>
+ <!-- inorder/OoO; 1 inorder; 0 OOO-->
+ <param name="number_hardware_threads" value="1"/>
+ <!-- number_instruction_fetch_ports(icache ports) is
always 1 in single-thread processor,
+ it only may be more than one in SMT processors. BTB
ports always equals to fetch ports since
+ branch information in consecutive branch instructions in
the same fetch group can be read out from BTB once.-->
+ <param name="fetch_width" value="2"/>
+ <!-- fetch_width determines the size of cachelines of L1
cache block -->
+ <param name="number_instruction_fetch_ports" value="1"/>
+ <param name="decode_width" value="2"/>
+ <!-- decode_width determines the number of ports of the
+ renaming table (both RAM and CAM) scheme -->
+ <param name="issue_width" value="4"/>
+ <param name="peak_issue_width" value="7"/>
+ <!-- issue_width determines the number of ports of Issue
window and other logic
+ as in the complexity effective processors paper;
issue_width==dispatch_width -->
+ <param name="commit_width" value="4"/>
+ <!-- commit_width determines the number of ports of
register files -->
+ <param name="fp_issue_width" value="1"/>
+ <param name="prediction_width" value="1"/>
+ <!-- number of branch instructions that can be predicted
simultaneously -->
+ <!-- Current version of McPAT does not distinguish int
and floating point pipelines
+ These parameters are reserved for future use.-->
+ <param name="pipelines_per_core" value="1,1"/>
+ <!--integer_pipeline and floating_pipelines, if the
floating_pipelines is 0, then the pipeline is shared-->
+ <param name="pipeline_depth" value="8,8"/>
+ <!-- pipeline depth of int and fp, if pipeline is
shared, the second number is the average cycles of fp ops -->
+ <!-- issue and exe unit-->
+ <param name="ALU_per_core" value="3"/>
+ <!-- contains an adder, a shifter, and a logical unit
-->
+ <param name="MUL_per_core" value="1"/>
+ <!-- For MUL and Div -->
+ <param name="FPU_per_core" value="1"/>
+ <!-- buffer between IF and ID stage -->
+ <param name="instruction_buffer_size" value="32"/>
+ <!-- buffer between ID and sche/exe stage -->
+ <param name="decoded_stream_buffer_size" value="16"/>
+ <param name="instruction_window_scheme" value="0"/><!--
0 PHYREG based, 1 RSBASED-->
+ <!-- McPAT support 2 types of OoO cores, RS based and
physical reg based-->
+ <param name="instruction_window_size" value="20"/>
+ <param name="fp_instruction_window_size" value="15"/>
+ <!-- Numbers need to be confirmed -->
+ <!-- the instruction issue Q as in Alpha 21264; The RS
as in Intel P6 -->
+ <param name="ROB_size" value="0"/>
+ <!-- each in-flight instruction has an entry in ROB -->
+ <!-- registers -->
+ <param name="archi_Regs_IRF_size" value="32"/>
+ <param name="archi_Regs_FRF_size" value="32"/>
+ <!-- if OoO processor, phy_reg number is needed for
renaming logic,
+ renaming logic is for both integer and floating point
insts. -->
+ <param name="phy_Regs_IRF_size" value="64"/>
+ <param name="phy_Regs_FRF_size" value="64"/>
+ <!-- rename logic -->
+ <param name="rename_scheme" value="0"/>
+ <!-- can be RAM based(0) or CAM based(1) rename scheme
+ RAM-based scheme will have free list, status table;
+ CAM-based scheme have the valid bit in the data field
of the CAM
+ both RAM and CAM need RAM-based checkpoint table,
checkpoint_depth=# of in_flight instructions;
+ Detailed RAT Implementation see TR -->
+ <param name="register_windows_size" value="0"/>
+ <!-- how many windows in the windowed register file,
sun processors;
+ no register windowing is used when this number is 0 -->
+ <!-- In OoO cores, loads and stores can be issued
either inorder(Pentium Pro) or (OoO)out-of-order(Alpha),
+ They will always try to execute out-of-order though. -->
+ <param name="LSU_order" value="inorder"/>
+ <param name="store_buffer_size" value="4"/>
+ <!-- By default, in-order cores do not have load
buffers -->
+ <param name="load_buffer_size" value="0"/>
+ <!-- number of ports refer to sustainable concurrent
memory accesses -->
+ <param name="memory_ports" value="1"/>
+ <!-- max_allowed_in_flight_memo_instructions determines
the # of ports of load and store buffer
+ as well as the ports of Dcache which is connected to
LSU -->
+ <!-- dual-pumped Dcache can be used to save the extra
read/write ports -->
+ <param name="RAS_size" value="32"/>
+ <!-- general stats, defines simulation periods;require
total, idle, and busy cycles for sanity check -->
+ <!-- please note: if target architecture is X86, then
all the instructions refer to (fused) micro-ops -->
+ <stat name="total_instructions" value="400000"/>
+ <stat name="int_instructions" value="200000"/>
+ <stat name="fp_instructions" value="100000"/>
+ <stat name="branch_instructions" value="100000"/>
+ <stat name="branch_mispredictions" value="0"/>
+ <stat name="load_instructions" value="0"/>
+ <stat name="store_instructions" value="50000"/>
+ <stat name="committed_instructions" value="400000"/>
+ <stat name="committed_int_instructions" value="200000"/>
+ <stat name="committed_fp_instructions" value="100000"/>
+ <stat name="pipeline_duty_cycle" value="1"/><!--<=1,
runtime_ipc/peak_ipc; averaged for all cores if homogeneous -->
+ <!-- the following cycle stats are used for
heterogeneous cores only,
+ please ignore them if homogeneous cores -->
+ <stat name="total_cycles" value="100000"/>
+ <stat name="idle_cycles" value="0"/>
+ <stat name="busy_cycles" value="100000"/>
+ <!-- instruction buffer stats -->
+ <!-- ROB stats, both RS and Phy based OoOs have ROB
+ performance simulator should capture the difference on
accesses,
+ otherwise, McPAT has to guess based on number of
committed instructions. -->
+ <stat name="ROB_reads" value="400000"/>
+ <stat name="ROB_writes" value="400000"/>
+ <!-- RAT accesses -->
+ <stat name="rename_reads" value="800000"/> <!--lookup
in renaming logic -->
+ <stat name="rename_writes" value="400000"/><!--update
dest regs. renaming logic -->
+ <stat name="fp_rename_reads" value="200000"/>
+ <stat name="fp_rename_writes" value="100000"/>
+ <!-- decode and rename stage use this, should be total
ic - nop -->
+ <!-- Inst window stats -->
+ <stat name="inst_window_reads" value="400000"/>
+ <stat name="inst_window_writes" value="400000"/>
+ <stat name="inst_window_wakeup_accesses"
value="800000"/>
+ <stat name="fp_inst_window_reads" value="200000"/>
+ <stat name="fp_inst_window_writes" value="200000"/>
+ <stat name="fp_inst_window_wakeup_accesses"
value="400000"/>
+ <!-- RF accesses -->
+ <stat name="int_regfile_reads" value="600000"/>
+ <stat name="float_regfile_reads" value="100000"/>
+ <stat name="int_regfile_writes" value="300000"/>
+ <stat name="float_regfile_writes" value="50000"/>
+ <!-- accesses to the working reg -->
+ <stat name="function_calls" value="5"/>
+ <stat name="context_switches" value="260343"/>
+ <!-- Number of window switches (number of function
calls and returns)-->
+ <!-- Alu stats by default, the processor has one FPU
that includes the divider and
+ multiplier. The fpu accesses should include accesses
to multiplier and divider -->
+ <stat name="ialu_accesses" value="300000"/>
+ <stat name="fpu_accesses" value="100000"/>
+ <stat name="mul_accesses" value="200000"/>
+ <stat name="cdb_alu_accesses" value="300000"/>
+ <stat name="cdb_mul_accesses" value="200000"/>
+ <stat name="cdb_fpu_accesses" value="100000"/>
+ <!-- multiple cycle accesses should be counted multiple
times,
+ otherwise, McPAT can use internal counter for different
floating point instructions
+ to get final accesses. But that needs detailed info for
floating point inst mix -->
+ <!-- currently the performance simulator should
+ make sure all the numbers are final numbers,
+ including the explicit read/write accesses,
+ and the implicit accesses such as replacements, etc.
+ Future versions of McPAT may be able to reason the
implicit access
+ based on param and stats of last level cache
+ The same rule applies to all cache access stats too!
-->
+ <!-- following is AF for max power computation.
+ Do not change them, unless you understand
them-->
+ <stat name="IFU_duty_cycle" value="1"/>
+ <stat name="LSU_duty_cycle" value="0.5"/>
+ <stat name="MemManU_I_duty_cycle" value="1"/>
+ <stat name="MemManU_D_duty_cycle" value="0.5"/>
+ <stat name="ALU_duty_cycle" value="1"/>
+ <stat name="MUL_duty_cycle" value="0.3"/>
+ <stat name="FPU_duty_cycle" value="0.3"/>
+ <stat name="ALU_cdb_duty_cycle" value="1"/>
+ <stat name="MUL_cdb_duty_cycle" value="0.3"/>
+ <stat name="FPU_cdb_duty_cycle" value="0.3"/>
+ <param name="number_of_BPT" value="2"/>
+ <component id="system.core0.predictor" name="PBT">
+ <!-- branch predictor; tournament predictor see
Alpha implementation -->
+ <param name="local_predictor_size"
value="10,3"/>
+ <param name="local_predictor_entries"
value="1024"/>
+ <param name="global_predictor_entries"
value="4096"/>
+ <param name="global_predictor_bits" value="2"/>
+ <param name="chooser_predictor_entries"
value="4096"/>
+ <param name="chooser_predictor_bits" value="2"/>
+ <!-- These parameters can be combined like
below in next version
+ <param name="load_predictor" value="10,3,1024"/>
+ <param name="global_predictor" value="4096,2"/>
+ <param name="predictor_chooser" value="4096,2"/>
+ -->
+ </component>
+ <component id="system.core0.itlb" name="itlb">
+ <param name="number_entries" value="64"/>
+ <stat name="total_accesses" value="200000"/>
+ <stat name="total_misses" value="4"/>
+ <stat name="conflicts" value="0"/>
+ <!-- there are no write requests to itlb
although writes happen to itlb after miss,
+ which is actually a replacement -->
+ </component>
+ <component id="system.core0.icache" name="icache">
+ <!-- there are no write requests to icache
although writes happen to it after miss,
+ which is actually a replacement -->
+ <param name="icache_config"
value="32768,8,4,1,10,10,32,0"/>
+ <!-- the parameters are capacity,block_width,
associativity, bank, throughput w.r.t. core clock, latency w.r.t. core
clock,output_width, cache policy, -->
+ <!-- cache_policy;//0 no write or write-through
with non-write allocate;1 write-back with write-allocate -->
+ <param name="buffer_sizes" value="4, 4, 4,0"/>
+ <!-- cache controller buffer sizes:
miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
+ <stat name="read_accesses" value="200000"/>
+ <stat name="read_misses" value="0"/>
+ <stat name="conflicts" value="0"/>
+ </component>
+ <component id="system.core0.dtlb" name="dtlb">
+ <param name="number_entries"
value="64"/><!--dual threads-->
+ <stat name="total_accesses" value="400000"/>
+ <stat name="total_misses" value="4"/>
+ <stat name="conflicts" value="0"/>
+ </component>
+ <component id="system.core0.dcache" name="dcache">
+ <!-- all the buffer related are optional -->
+ <param name="dcache_config" value="32768,8,4,1,
10,10, 32,1 "/>
+ <param name="buffer_sizes" value="4, 4, 4, 4"/>
+ <!-- cache controller buffer sizes:
miss_buffer_size(MSHR),fill_buffer_size,prefetch_buffer_size,wb_buffer_size-->
+ <stat name="read_accesses" value="800000"/>
+ <stat name="write_accesses" value="27276"/>
+ <stat name="read_misses" value="1632"/>
+ <stat name="write_misses" value="183"/>
+ <stat name="conflicts" value="0"/>
+ </component>
+ <param name="number_of_BTB" value="2"/>
+ <component id="system.core0.BTB" name="BTB">
+ <!-- all the buffer related are optional -->
+ <param name="BTB_config" value="2048,4,2, 2,
1,3"/> <!--should be 4096 + 1024 -->
+ <!-- the parameters are
capacity,block_width,associativity,bank, throughput w.r.t. core clock, latency
w.r.t. core clock,-->
+ <stat name="read_accesses" value="400000"/>
<!--See IFU code for guideline -->
+ <stat name="write_accesses" value="0"/>
+ </component>
+ </component>
+ <component id="system.L1Directory0" name="L1Directory0">
+ <param name="Directory_type" value="0"/>
+ <!--0 cam based shadowed tag. 1 directory cache -->
+ <param name="Dir_config" value="2048,1,0,1, 4,
4, 8"/>
+ <!-- the parameters are capacity,block_width,
associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+ <param name="buffer_sizes" value="8, 8, 8, 8"/>
+ <!-- all the buffer related are optional -->
+ <param name="clockrate" value="2000"/>
+ <param name="ports" value="1,1,1"/>
+ <!-- number of r, w, and rw search ports -->
+ <param name="device_type" value="0"/>
+ <!-- although there are multiple access types,
+ Performance simulator needs to cast them into
reads or writes
+ e.g. the invalidates can be considered as
writes -->
+ <stat name="read_accesses" value="800000"/>
+ <stat name="write_accesses" value="27276"/>
+ <stat name="read_misses" value="1632"/>
+ <stat name="write_misses" value="183"/>
+ <stat name="conflicts" value="20"/>
+ <stat name="duty_cycle" value="0.1"/>
+ </component>
+ <component id="system.L2Directory0" name="L2Directory0">
+ <param name="Directory_type" value="1"/>
+ <!--0 cam based shadowed tag. 1 directory cache -->
+ <param name="Dir_config"
value="1048576,16,16,1,2, 100"/>
+ <!-- the parameters are capacity,block_width,
associativity,bank, throughput w.r.t. core clock, latency w.r.t. core clock,-->
+ <param name="buffer_sizes" value="8, 8, 8, 8"/>
+ <!-- all the buffer related are optional -->
+ <param name="clockrate" value="2000"/>
+ <param name="ports" value="1,1,1"/>
+ <!-- number of r, w, and rw search ports -->
+ <param name="device_type" value="0"/>
+ <!-- although there are multiple access types,
+ Performance simulator needs to cast them into
reads or writes
+ e.g. the invalidates can be considered as
writes -->
+ <stat name="read_accesses" value="58824"/>
+ <stat name="write_accesses" value="27276"/>
+ <stat name="read_misses" value="1632"/>
_______________________________________________
gem5-dev mailing list
[email protected]
http://m5sim.org/mailman/listinfo/gem5-dev