% Hu et al., ICCAD 2025 — PLAIN: mixed-precision PIM acceleration for LLM inference.
% Cleaned from IEEE Xplore auto-export: braced acronyms, `--` page range,
% month macro, dropped empty volume/number fields. Key kept as-is so
% existing \cite{11240991} references continue to work.
@inproceedings{11240991,
  author    = {Hu, Yiwei and Liu, Fangxin and Wang, Zongwu and Zhao, Yilong and Yang, Tao and Jiang, Li and Guan, Haibing},
  title     = {{PLAIN}: Leveraging High Internal Bandwidth in {PIM} for Accelerating Large Language Model Inference via Mixed-Precision Quantization},
  booktitle = {2025 {IEEE/ACM} International Conference on Computer Aided Design ({ICCAD})},
  year      = {2025},
  month     = oct,
  pages     = {1--9},
  doi       = {10.1109/ICCAD66269.2025.11240991},
  issn      = {1558-2434},
  keywords  = {Technological innovation;Schedules;Quantization (signal);Tensors;Large language models;Computational modeling;Memory management;Bandwidth;Inference algorithms;Hardware},
  abstract  = {DRAM-based processing-in-memory (DRAM-PIM) has gained commercial prominence in recent years. However, its integration for deep learning acceleration, particularly for large language models (LLMs), poses inherent challenges. Existing DRAM-PIM systems are limited in computational capabilities, primarily supporting element-wise and general matrix-vector multiplication (GEMV) operations, which contribute only a small portion of the execution time in LLM workloads. As a result, current systems still require powerful host processors to manage compute-heavy operations. To address these challenges and expand the applicability of commodity DRAM-PIMs in accelerating LLMs, we introduce PLAIN, a novel software/hardware co-design framework for PIM-enabled systems. PLAIN leverages the distribution locality of parameters and the unique characteristics of PIM to achieve optimal trade-offs between inference cost and model quality. Our framework includes three key innovations: 1) firstly, we propose a novel quantization algorithm that determines the optimal precision of parameters within each layer, considering both algorithmic and hardware characteristics to optimize hardware mapping; 2) PLAIN strategically utilizes both GPUs and PIMs, leveraging the high internal memory bandwidth within HBM for attention layers and the powerful compute capability of conventional systems for fully connected (FC) layers; 3) PLAIN integrates a workload-aware dataflow scheduler that efficiently arranges complex computations and memory access for mixed-precision tensors, optimizing execution across different hardware components. Experiments show PLAIN outperforms the conventional GPU with the same memory parameters and the state-of-the-art PIM accelerator, achieving a 5.03$\times$ and 1.69$\times$ performance boost, with negligible model quality loss.},
}