% UM-PIM conference paper (ISCA 2024).
% The citation key is kept unchanged so existing \cite{10609641} commands keep working.
% Text outside an @entry is ignored by BibTeX, so these lines act as comments.
@inproceedings{10609641,
  author    = {Zhao, Yilong and Gao, Mingyu and Liu, Fangxin and Hu, Yiwei and Wang, Zongwu and Lin, Han and Li, Ji and Xian, He and Dong, Hanlin and Yang, Tao and Jing, Naifeng and Liang, Xiaoyao and Jiang, Li},
  title     = {{UM-PIM}: {DRAM}-based {PIM} with Uniform \& Shared Memory Space},
  booktitle = {2024 {ACM/IEEE} 51st Annual International Symposium on Computer Architecture ({ISCA})},
  year      = {2024},
  month     = jun,
  pages     = {644--659},
  doi       = {10.1109/ISCA59077.2024.00053},
  keywords  = {Degradation;Memory management;Memory architecture;Layout;Bandwidth;Switches;Data transfer;Processing in Memory (PIM);DRAM;Address Mapping;Data Re-layout},
  abstract  = {DRAM-based Processing in Memory (PIM) addresses the ``memory wall'' problem by incorporating computing units (PIM units) into main memory devices for faster and wider local data access.
    However, critical challenges prevent PIM units from being compatible with existing CPU hosts.
    Memory interleaving and virtual memory limit the size of contiguous data visible to PIM units that constrains the granularity of PIM tasks.
    Fine-grained PIM tasks result in significant CPU-PIM offloading overhead, offsetting the speed-up of PIM.
    Existing PIM systems adopt drastic measures to ensure PIM task offloading efficiency, including isolating PIM memory space and turning off global memory interleaving.
    These interventions, however, decrease the CPU's memory bandwidth and introduce extra data transfer, leading to an additional ``system memory wall''.
    This new ``wall'' must be eliminated before fully embracing the PIM technology.
    In this work, we propose UM-PIM, a PIM system with interleaved CPU pages and non-interleaved PIM pages coexisting in a Uniform and Shared Memory space.
    UM-PIM enables zero-copy during PIM task offloading and maintains the CPU's memory bandwidth while ensuring PIM offloading efficiency.
    Firstly, we propose a dual-track memory management mechanism consisting of independent page allocation and address translation for the two kinds of pages, respectively.
    Second, we design UM-PIM interface hardware on the DIMM (with PIMs) side to provide a dynamic address mapping for accelerating the data re-layout.
    Finally, we provide APIs to reduce PIM-to-PIM communication overhead by optimizing the CPU's access to PIM pages in different communication modes.
    We compare UM-PIM with a CPU system and the current PIM systems.
    Results show negligible performance degradation for CPU workloads ($< 0.1 \%$) on UM-PIM, contrasting with the $25.8 \%$ degradation on the current PIM system with memory interleaving switched off.
    For PIM workloads partitioned to CPU and PIM units, UM-PIM can reduce the CPU time by $4.93 \times$, resulting in an end-to-end $1.96 \times$ speedup on average.},
}