% Conference paper on the Hercules recommendation-inference serving framework (HPCA 2022).
% The citation key is kept unchanged so existing \cite commands still resolve.
% Funding and copyright text is stored in the annote field, which standard styles
% do not print; the note field carries only the conference event information.
@inproceedings{cf493efab1e4435db94d4e1f8d164cc3,
  author    = {Ke, Liu and Gupta, Udit and Hempstead, Mark and Wu, Carole-Jean and Lee, Hsien-Hsin S. and Zhang, Xuan},
  title     = {{Hercules}: Heterogeneity-Aware Inference Serving for At-Scale Personalized Recommendation},
  booktitle = {Proceedings - 2022 {IEEE} International Symposium on High-Performance Computer Architecture, {HPCA} 2022},
  series    = {Proceedings - International Symposium on High-Performance Computer Architecture},
  publisher = {IEEE Computer Society},
  pages     = {141--154},
  year      = {2022},
  doi       = {10.1109/HPCA53966.2022.00019},
  language  = {English (US)},
  note      = {28th Annual IEEE International Symposium on High-Performance Computer Architecture, HPCA 2022; Conference date: 02-04-2022 Through 06-04-2022},
  annote    = {Funding Information: The authors would like to thank the anonymous reviewers for their valuable comments and suggestions. Liu Ke and Xuan Zhang were partially supported by NSF CCF-1942900. Publisher Copyright: {\textcopyright} 2022 IEEE.},
  abstract  = {Personalized recommendation is an important class of deep-learning applications that powers a large collection of internet services and consumes a considerable amount of datacenter resources. As the scale of production-grade recommendation systems continues to grow, optimizing their serving performance and efficiency in a heterogeneous datacenter is important and can translate into infrastructure capacity saving. In this paper, we propose Hercules, an optimized framework for personalized recommendation inference serving that targets diverse industry-representative models and cloud-scale heterogeneous systems. Hercules performs a two-stage optimization procedure - offline profiling and online serving. The first stage searches the large under-explored task scheduling space with a gradient-based search algorithm achieving up to 9.0{$\times$} latency-bounded throughput improvement on individual servers; it also identifies the optimal heterogeneous server architecture for each recommendation workload. The second stage performs heterogeneity-aware cluster provisioning to optimize resource mapping and allocation in response to fluctuating diurnal loads. The proposed cluster scheduler in Hercules achieves 47.7\% cluster capacity saving and reduces the provisioned power by 23.7\% over a state-of-the-art greedy scheduler.},
}