华艺学术文献数据库

题名	考慮通量處理器中的異質行為之工作群映射與快取管理機制
并列篇名	HETEROGENEITY-AWARE WORK-GROUP MAPPING AND CACHE MANAGEMENT FOR MODERN GPUS
作者	毛沙敏
关键词	SBM ； Kernels ； Resource usage
期刊名称	交通大學電機資訊國際學位學程學位論文
卷期/出版年月	2016年
学位类别	碩士
导师	賴伯承
内容语文	英文
主题分类	基礎與應用科學 > 資訊科學 ； 電機學院 > 電機資訊國際學位學程 ； 工程學 > 電機工程
参考文献	title = {Multi2Sim: A Simulation Framework for CPU-GPU Computing}, 連結： booktitle = {Proceedings of the 21st International Conference on Parallel Architectures and Compilation Techniques}, 連結： keywords = {CPU-GPU, heterogeneous computing, multi2sim, simulation}, 連結： author={M. Lee and J. h. Jeon and J. Kim and J. Song}, 連結： title={Scalable and Parallel Implementation of a Financial Application on a GPU: With Focus on Out-of-Core Case}, 連結： year={2010}, 連結： title={Porting a neuro-imaging application to a CPU-GPU cluster}, 連結： booktitle={Nuclear Science Symposium and Medical Imaging Conference (NSS/MIC), 2011 IEEE}, 連結： title={Implementing Geant4 on GPU for medical applications}, 連結： year={2011}, 連結： title={How far is the GPU technology from practical power system applications?}, 連結： title={The GAP project - GPU for realtime applications in high energy physics and medical imaging}, 連結： title={TAP: A TLP-aware cache management policy for a CPU-GPU heterogeneous architecture}, 連結： title={Runtime Support for Adaptive Spatial Partitioning and Inter-Kernel Communication on GPUs}, 連結： title = {Application-aware Memory System for Fair and Efficient Execution of Concurrent GPGPU Applications}, 連結： location = {Salt Lake City, UT, USA}, 連結： publisher = {ACM}, 連結： keywords = {CUDA Streams, GPGPUs, Memory System}, 連結： booktitle={Proceedings of the 22nd International Conference on Parallel Architectures and Compilation Techniques}, 連結： title={Coordinated static and dynamic cache bypassing for GPUs}, 連結： title={Adaptive Cache Bypassing for Inclusive Last Level Caches}, 連結： year={2013}, 連結： title={Run-time cache bypassing}, 連結： author={M. Kharbutli and Y. Solihin}, 連結： journal={IEEE Transactions on Computers}, 連結： title={Counter-Based Cache Replacement and Bypassing Algorithms}, 連結： booktitle={Microelectronics, 2004. ICM 2004 Proceedings. 
The 16th International Conference on}, 連結： title={Power consumption awareness in cache memory design with SystemC}, 連結： year={2004}, 連結： author = {Mark Harris}, 連結： title = {{ASYNCHRONOUS SHADERS: UNLOCKING THE FULL POTENTIAL OF THE GPU}}, 連結： year = {2015}, 連結： author = {Thomas Bradley}, 連結： title = {{AMD GRAPHICS CORES NEXT (GCN) ARCHITECTURE}}, 連結： year = {2012}, 連結： year = {2014}, 連結： title={An improved GPU MapReduce framework for data intensive applications}, 連結： title={Coordinating the use of GPU and CPU for improving performance of compute intensive applications}, 連結： @inproceedings{multi2sim, author = {Ubal, Rafael and Jang, Byunghyun and Mistry, Perhaad and Schaa, Dana and Kaeli, David}, series = {PACT '12}, isbn = {978-1-4503-1182-3}, location = {Minneapolis, Minnesota, USA}, pages = {335--344}, numpages = {10}, url = {http://doi.acm.org/10.1145/2370816.2370865}, doi = {10.1145/2370816.2370865}, acmid = {2370865}, @INPROCEEDINGS{financial_app, booktitle={Computer and Information Technology (CIT), 2010 IEEE 10th International Conference on}, pages={1323-1327}, keywords={computer graphic equipment;coprocessors;financial data processing;parallel programming;GPU;financial application;general purpose GPU computing;graphic processing unit;out-of-core case;parallel implementation;parallel programming;scalable implementation;Arrays;Computational modeling;Graphics processing unit;Instruction sets;Kernel;Monte Carlo methods;Random variables;GPU;High Performance Computing;Monte-Carlo simulation;Shared-Memor;out-of-core}, doi={10.1109/CIT.2010.238}, month={June},} @INPROCEEDINGS{neuro_imaging_app, author={R. S. Nakhjavani and S. Sharify and A. B. Hashemi and A. W. Lu and C. Amza and S. 
Strother}, booktitle={High Performance Computing Simulation (HPCS), 2014 International Conference on}, pages={137-145}, keywords={graphics processing units;medical image processing;neurophysiology;parallel processing;scheduling;CPU-GPU cluster;HPC clusters;MCT;NPAIRS;Sufferage scheduling algorithms;Torque;graphical processing units;heterogeneous cluster;linear algebra operations;neuroimaging application;nonexpert biomedical scientists;Eigenvalues and eigenfunctions;Graphics processing units;Java;Job shop scheduling;Libraries;Principal component analysis;Scheduling algorithms}, doi={10.1109/HPCSim.2014.6903679}, @INPROCEEDINGS{medical_imaging_app, author={H. Perez-Ponce and Z. El Bitar and Y. Boursier and D. Vintache and A. Bonissent and C. Morel and D. Brasse and D. Visvikis and J. Bert}, pages={2703-2707}, keywords={Monte Carlo methods;emission tomography;graphics processing units;medical computing;radiation therapy;GPU;Geant4 implementation;Monte Carlo simulation;computer clusters;dosimetry;emission tomography;graphics processing units;medical applications;medical imaging;photon physics;radiotherapy;Computational modeling;Graphics processing unit;Monte Carlo methods;Navigation;Photonics;Scattering}, doi={10.1109/NSSMIC.2011.6152953}, @INPROCEEDINGS{power_system_app, author={Z. Li and J. Zhu and F. Yang}, booktitle={2014 IEEE PES General Meeting}, pages={1-5}, keywords={graphics processing units;power system simulation;smart power grids;GPU technology;graphic processing unit;large scale power system;power system applications;smart grid technologies;Computational modeling;Graphics processing units;Load modeling;Optimization;Power grids;Runtime;Transportation;Graphic Process Unit (GPU);Smart Grid;affordable high performance computation platform;power system simulation and analysis}, doi={10.1109/PESGM.2014.6939132}, ISSN={1932-5517}, month={July},} @INPROCEEDINGS{high_energy_physics, author={R. Ammendola and M. Bauce and A. Biagioni and R. Fantechi and M. 
Fiorini and S. Giagu and E. Graverini and G. Lamanna and A. Lonardo and A. Messina and F. Pantaleo and R. Piandani and M. Rescigno and F. Simula and M. Sozzi and P. Vicini}, booktitle={2013 IEEE Nuclear Science Symposium and Medical Imaging Conference (2013 NSS/MIC)}, pages={1-7}, keywords={biomedical imaging;data acquisition;graphics processing units;high energy physics instrumentation computing;position sensitive particle detectors;software selection;ATLAS experiment;CERN;DAQ systems;GAP project;GPU applications;GPU latency steady reduction;GPU pilot project;NA62 experiment field test;commercial GPU parallel computing power;commercial multicore PC farms;general-purpose commodity systems;general-purpose computing;graphics processing units;hardware implementation;high energy physics data acquisition;high energy physics experiments;high level triggers;medical imaging;memory throughput;muon trigger;offline computation accelration;online parallel computing;online triggering applications;particle physics experiments;pure software selection system;realtime applications;realtime high energy physics applications;synchronous low level fixed-latency triggers;trigger development;trigger level reduction;trigger systems;very innovative approach;Data transfer;Graphics processing units;Hardware;Kernel;Protocols;Real-time systems;Standards}, doi={10.1109/NSSMIC.2013.6829757}, ISSN={1082-3654}, @INPROCEEDINGS{tap_tlp, author={J. Lee and H. 
Kim}, booktitle={IEEE International Symposium on High-Performance Comp Architecture}, pages={1-12}, keywords={cache storage;graphics processing units;CPU-GPU heterogeneous architecture;TAP;TAP-RRIP;TAP-UCP;TLP-aware cache management policy;core-sampling mechanism;dynamic cache partitioning;last-level cache management;promotion-based cache management;rereference interval prediction;several shared cache management mechanisms;shared resource management;thread-level parallelism;utility-based cache partitioning;Benchmark testing;Computer architecture;Graphics processing unit;Instruction sets;Measurement;Radiation detectors;System-on-a-chip}, doi={10.1109/HPCA.2012.6168947}, @INPROCEEDINGS{adaptiveSpatialPartitioning, author={Y. Ukidave and C. Kalra and D. Kaeli and P. Mistry and D. Schaa}, booktitle={Computer Architecture and High Performance Computing (SBAC-PAD), 2014 IEEE 26th International Symposium on}, pages={168-175}, keywords={graphics processing units;operating system kernels;scheduling;GPU;OpenCL runtime environment;adaptive spatial partitioning;dynamic spatial partitioning;interkernel communication;runtime support;scheduling mechanism;Benchmark testing;Graphics processing units;Kernel;Performance evaluation;Processor scheduling;Resource management;Runtime}, doi={10.1109/SBAC-PAD.2014.43}, ISSN={1550-6533}, month={Oct},} @inproceedings{applicationAwareMemory, author = {Jog, Adwait and Bolotin, Evgeny and Guz, Zvika and Parker, Mike and Keckler, Stephen W. and Kandemir, Mahmut T. and Das, Chita R.}, booktitle = {Proceedings of Workshop on General Purpose Processing Using GPUs}, series = {GPGPU-7}, isbn = {978-1-4503-2766-4}, pages = {1:1--1:8}, articleno = {1}, numpages = {8}, url = {http://doi.acm.org/10.1145/2576779.2576780}, doi = {10.1145/2576779.2576780}, acmid = {2576780}, address = {New York, NY, USA}, @INPROCEEDINGS{managing_shared_cache, author={V. Mekkat and A. Holey and P. C. Yew and A. 
Zhai}, title={Managing shared last-level cache in a heterogeneous multicore processor}, pages={225-234}, keywords={cache storage;graphics processing units;multi-threading;multiprocessing systems;resource allocation;CPU cores;GPU LLC access throttling;GPU application;GPU cores;GPU threads;GPU tolerance;HeLM;LLC misses;LRU policy;TAP-RRIP;cache management policies;cache sensitive CPU applications;data-parallel accelerators;heterogeneous LLC management;heterogeneous multicore processors;latency tolerance;memory access latency;on-chip resources sharing;shared LLC management policy;shared last-level cache management;thread-level parallelism;Benchmark testing;Graphics processing units;Instruction sets;Multicore processing;Runtime;Sensitivity;cache management policy;heterogeneous multicores;shared last-level cache}, doi={10.1109/PACT.2013.6618819}, ISSN={1089-795X}, @INPROCEEDINGS{coordinated_sd_cache_bypassing, author={X. Xie and Y. Liang and Y. Wang and G. Sun and T. Wang}, booktitle={2015 IEEE 21st International Symposium on High Performance Computer Architecture (HPCA)}, pages={76-88}, keywords={cache storage;graphics processing units;multi-threading;parallel architectures;CUDA programming model;GPUs;bypass preferences;cache resource contention problem;compile-time analysis;coordinated dynamic cache bypassing;coordinated static cache bypassing;dynamic bypassing technique;graphics processing units;memory divergence problems;on-chip memory;parallel architecture;run-time management;scratchpad memory;thread blocks;thread contention;Arrays;Graphics processing units;Instruction sets;Kernel;Pipelines;Synchronization;System-on-chip}, doi={10.1109/HPCA.2015.7056023}, ISSN={1530-0897}, month={Feb},} @INPROCEEDINGS{bypassing_buffer, author={S. Gupta and H. Gao and H. 
Zhou}, booktitle={Parallel Distributed Processing (IPDPS), 2013 IEEE 27th International Symposium on}, pages={1243-1253}, keywords={cache storage;cost reduction;memory architecture;performance evaluation;LLC performance enhancement;adaptive cache bypassing;bypass buffer;cache bypassing algorithm;cache hierarchy design;cache lines;cache replacement;hardware implementation cost reduction;high performance caches;inclusive last level caches;last level cache performance enhancement;upper level caches;usage information;Algorithm design and analysis;Art;Benchmark testing;Buffer storage;Coherence;Hardware;Resource management;Last level cache;cache bypassing;cache replacement policy;inclusion property}, doi={10.1109/IPDPS.2013.16}, ISSN={1530-2075}, month={May},} @ARTICLE{cpu_bypassing_one, author={T. L. Johnson and D. A. Connors and M. C. Merten and W. M. W. Hwu}, year={1999}, volume={48}, number={12}, pages={1338-1354}, keywords={cache storage;intelligent control;storage management;cache management;compiler techniques;instruction caches;integer programs;intelligent control;memory performance;microarchitecture scheme;run-time cache bypassing;trace-driven simulations;upper bounds;Delay;Hardware;Intelligent control;Memory management;Microarchitecture;Optimizing compilers;Program processors;Resource management;Runtime;Upper bound}, doi={10.1109/12.817393}, @ARTICLE{cpu_bypassing_two, volume={57}, number={4}, pages={433-447}, keywords={cache storage;content-addressable storage;access interval predictor;associative caches;bypassing algorithms;counter-based cache replacement;least recently used algorithm;live-time predictor;multilevel caches;theoretical optimal replacement algorithm;Algorithm design and analysis;Counting circuits;Filtering;Pollution;Prediction algorithms;Cache Bypassing;Cache Misses;Cache Replacement;Cache memories;Counter-Based Algorithms}, doi={10.1109/TC.2007.70816}, ISSN={0018-9340}, month={April},} @INPROCEEDINGS{using_cacti_one, author={S. Niar and S. 
Meftali and J. L. Dekeyser}, pages={244-247}, keywords={cache storage;electronic engineering computing;embedded systems;integrated circuit design;integrated memory circuits;power consumption;RTL;SystemC description;cache memory design;cache memory module;electronic engineering computing;embedded system simulation;power consumption analytical model;register transfer level;system-on-chip;Cache memory;Design methodology;Electronic packaging thermal management;Embedded system;Energy consumption;Hardware;Libraries;Mobile computing;Power system modeling;Thermal management}, doi={10.1109/ICM.2004.1434257}, month={Dec},} @misc{nvidia_streams, title = {{GPU Pro Tip: CUDA 7 Streams Simplify Concurrency}}, howpublished = "\url{https://devblogs.nvidia.com/parallelforall/gpu-pro-tip-cuda-7-streams-simplify-concurrency/}", @misc{amd_ace, howpublished = "\url{http://amd-dev.wpengine.netdna-cdn.com/wordpress/media/2012/10/Asynchronous-Shaders-White-Paper-FINAL.pdf}", @misc{hyperq, title = {{Hyper-Q Example}}, howpublished = "\url{http://docs.nvidia.com/cuda/samples/6_Advanced/simpleHyperQ/doc/HyperQ.pdf}", @misc{amd_core_next, howpublished = "\url{https://www.amd.com/Documents/GCN_Architecture_whitepaper.pdf}", @misc{amd_sdk, author = {AMD}, title = {{APP SDK – A Complete Development Platform}}, howpublished = "\url{http://developer.amd.com/tools-and-sdks/opencl-zone/amd-accelerated-parallel-processing-app-sdk/}", @misc{cacti_tool, author = {HP Labs}, title = {{CACTI: An integrated cache and memory access time, cycle time, area, leakage, and dynamic power model}}, howpublished = "\url{http://www.hpl.hp.com/research/cacti/}", year = {2008}, note = "[Online; accessed 20-July-2016]" } @INPROCEEDINGS{data_intensive_app, author={R. Nitu and E. Apostol and V. 
Cristea}, booktitle={Intelligent Computer Communication and Processing (ICCP), 2014 IEEE International Conference on}, pages={355-362}, keywords={data handling;graphics processing units;GPU MapReduce framework;GPU clusters;GPU programming;MapReduce paradigm;data intensive applications;distributed applications;intensive data processing;large scale computing;Graphics processing units;Kernel;Mars;Memory management;Parallel processing;Process control;Vectors;GPU MapReduce;Hadoop;OpenCL;shared memory}, doi={10.1109/ICCP.2014.6937021}, month={Sept},} @INPROCEEDINGS{compute_intensive_app, author={G. Teodoro and R. Sachetto and O. Sertel and M. N. Gurcan and W. Meira and U. Catalyurek and R. Ferreira}, booktitle={2009 IEEE International Conference on Cluster Computing and Workshops}, year={2009}, pages={1-10}, keywords={coprocessors;digital filters;medical image processing;microcomputers;tumours;Anthill runtime environment;compute intensive applications;distributed execution;dual-core machine;event-driven filters;graphics processing unit;histopathology application;image analysis techniques;multi-core CPUs;neuroblastoma prognosis;octa-core machine;parallel co-processors;standalone execution;Biomedical computing;Cancer;Computer applications;Concurrent computing;Image analysis;Microscopy;Neoplasms;Niobium;Performance analysis;Runtime environment}, doi={10.1109/CLUSTR.2009.5289193}, ISSN={1552-5244}, month={Aug},}