{"id":943,"date":"2025-11-24T19:39:34","date_gmt":"2025-11-25T03:39:34","guid":{"rendered":"https:\/\/identia.digital\/lmcache\/?p=943"},"modified":"2025-11-24T19:39:35","modified_gmt":"2025-11-25T03:39:35","slug":"lmcache%e9%9d%a2%e5%90%91%e4%bc%81%e4%b8%9a%e7%ba%a7%e5%a4%a7%e8%af%ad%e8%a8%80%e6%a8%a1%e5%9e%8b%e6%8e%a8%e7%90%86%e7%9a%84%e9%ab%98%e6%95%88kv-cache%e5%b1%82","status":"publish","type":"post","link":"https:\/\/identia.digital\/lmcache\/en\/2025\/11\/24\/lmcache%e9%9d%a2%e5%90%91%e4%bc%81%e4%b8%9a%e7%ba%a7%e5%a4%a7%e8%af%ad%e8%a8%80%e6%a8%a1%e5%9e%8b%e6%8e%a8%e7%90%86%e7%9a%84%e9%ab%98%e6%95%88kv-cache%e5%b1%82\/","title":{"rendered":"LMCACHE????????????????KV Cache?"},"content":{"rendered":"\n<p class=\"wp-block-paragraph\">???Yihua Cheng ?Yuhan Liu ? Jiayi Yao * ?Yuwei An?Xiaokun Chen?Shaoting Feng ? Yuyang Huang?Samuel Shen?Kuntai Du?Junchen Jiang<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">???<strong>TensorMesh<\/strong>&amp;<strong>?????<\/strong><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"??\">??<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">?????????LLM???????????????????????????????????????????????????????????KV Cache???????????????????????????? GPU ??????????????? ?????????????KV Cache?????????? LMCACHE???????????? KV Cache????????????????? LLM ?????vLLM ? SGLang???? KV Cache??????????????LMCACHE ? LLM ??????? KV Cache???????? LLM ??????token??????? KV Cache?????????????????????????????????????????????PD??????????????LMCACHE ???????????????????1?????? KV Cache????????????????????? I\/O ??????????2????? KV Cache??????? LMCACHE ??????????????3?????? API??????????????????????? GPU?CPU?????????????????????????LMCACHE ? vLLM ?????????????????????????????? 15 ???????????LMCACHE ???????????????? KV Cache??????????????????????<a href=\"https:\/\/github.com\/LMCache\/LMCache%E3%80%82\">https:\/\/github.com\/LMCache\/LMCache?<\/a><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"1-??\">1. ??<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">?????????????????????LLM ???????????????????????????????????????????????LLM ????????????\u2014\u2014?????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?? LLM ???????????????????????????????????????????????????????? LLM ????????????? I\/O ?????????????? GPU ? GPU ????????????token????????????token??????????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????????????????????????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img fetchpriority=\"high\" decoding=\"async\" width=\"1024\" height=\"329\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309-1024x329.png\" alt=\"????? LMCACHE ?????????????????? CPU \/ ?????????? KV ????????????????? KV ???????\" class=\"wp-image-947\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309-1024x329.png 1024w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309-300x96.png 300w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309-768x247.png 768w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309-1200x386.png 1200w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031100044309.png 1312w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>?????????????<\/strong>?????????????????????????????LLM ???????Prefill??????Prefill?????????????? \u2014\u2014 ????????????????????token???????Prefill? LLM ??????? KV Cache?????????????????????????????????????????????????????????? KV Cache??????????????????????????????????Prefill????token?????TTFT???? GPU ?????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>PD???????????<\/strong>?Prefill????????????????decode????????????????????????????????????????decode???????prefill?????GPU ?????????????????????prefill????????decode??????????PD??????????prefill???????????????????? KV Cache??????????decode???????????????????decode?????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????????LLM ?????????? KV Cache?????????????????????????????? KV Cache????? KV Cache???????????????????? KV Cache??????????????????????????KV ?????????????????????????????????????? vLLM ? SGLang??????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">???? LMCACHE\u2014\u2014 ???????????? KV Cache????????????? LMCACHE?KV Cache???????????????????????????????????CPU ????????????? Redis???????????????RDMA?NVLink????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ??????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>???????<\/strong>?????????????? KV Cache????????????????????????????? KV Cache????????????? GPU ??????? \/ ???????????????????????? KV Cache??????????????????????????? \/ ?? KV ???????????? GPU ?????????????????????? KV Cache?????????????????<\/li>\n\n\n\n<li><strong>??????????<\/strong>?????????????????????????????2025 ?????? 15-20 ???????????????????????????? LLM ????????????????? GPU ???? KV ????????? LMCACHE ??????LMCACHE ????????? KV ????????? LMCACHE ????????????????????????? API?<\/li>\n\n\n\n<li><strong>??? KV Cache????<\/strong>?LMCACHE ???????? KV Cache?LLM ????????????????????????????????????????? KV Cache? API????? API ??????????????????????????? KV Cache???????<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">???????LMCACHE ??????????????????????? PD ???????????????????? KV Cache????????? API????????15 ???????? 2 ???????????LMCACHE ?????????????????? KV ???????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????? 2 ???LMCACHE ??????????? 3-6 ????????? 8 ????????? 9 ???<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"2-??\">2. ??<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"21-???????????\">2.1 ???????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">?????LLM ?????????????????????? LLM ??????????????????????????????????????? LLM ?????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>?????<\/strong>???????????????????????????????? \u2014\u2014 ????token???token?????????????????????????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li>??? LLM ??????????????????????????????? LLM ????<\/li>\n\n\n\n<li>LLM ????????????????????????????????token?????? LLM?<\/li>\n\n\n\n<li>??? LLM ??????????????? \/ ???????????????token?<\/li>\n\n\n\n<li>????????????????????????????????????????<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>?????<\/strong>?????????????? 95 ?? \/ 99 ?????????????????????????????????????????????????????????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????????????????<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>??????????????????? 8-16K token????token?????TTFT????prefill?????<\/li>\n\n\n\n<li>??????token????ITL?????????????? GPU ??????token?????????<\/li>\n\n\n\n<li>??????????????????????????????????????<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"22-kv-cache??????????\">2.2 KV Cache??????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">KV Cache?????????????????? \u2014\u2014 ???token????token??????? K ? V ?????????? GPU ????KV Cache????????????token?????????????? LLM ??????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????? Transformer ????????????????????????????????????KV Cache?????????????????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>????????? KV Cache?????<\/strong>??????????? KV ???????????????????????????????????????????????????????????????????????prefill????????????? TTFT ?????? GPU ?????<\/li>\n\n\n\n<li><strong>PD???????? KV Cache???<\/strong>?????????prefill?????????????decode??????token????????? GPU ??????????????????????????prefill??????????????<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\"><strong>KV Cache?????<\/strong><\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????????? KV Cache??? GPU ???CPU ???DRAM?NVMe ???????????????????<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li>??????????? KV Cache???????????????? KV Cache???????????????????????<\/li>\n\n\n\n<li>PD ???? GPU ?????? KV CaCHE????? PCIe?NVLink ? RDMA???????prefill???decode???????????<\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"23-??-kv-?????\">2.3 ?? KV ?????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">????????????? PD ???????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??-1???????-io-??\">?? 1??????? I\/O ??<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? KV Cache???????? PyTorch ????torch.save\/torch.load????????????????????? 1GB????????????????????? KV ??????????????????????????????????????????? CPU-GPU ?????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????? vLLM ? SGLang??????????????????????????????????? 16-64KB???????? KV Cache????????????vLLM ? Llama3.1-8B-Instruct ????? 62.5KB ????????????????????????????? KV Cache??????????????????? KV Cache?????????? I\/O ???????????????????????????????????????? 8 ??? Thor-2 400Gbps ??????? AMD GPU ???????????????? 16MB ????????????????????????????? 1-2MB??????? PCIe 5.0 ????? 75-80%?<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>????<\/th><th>?????<\/th><\/tr><\/thead><tbody><tr><td>64KB<\/td><td>4GBps<\/td><\/tr><tr><td>256KB<\/td><td>13GBps<\/td><\/tr><tr><td>1MB<\/td><td>30GBps<\/td><\/tr><tr><td>10MB<\/td><td>46GBps<\/td><\/tr><tr><td>16MB<\/td><td>49GBps<\/td><\/tr><tr><td>100MB<\/td><td>49GBps<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>? 1??? RCCL ????????????????<\/em><\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??-2????????????\">?? 2????????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? AI ???????? LLM ???????????2025 ????? 4 ??????? LLM ???????????????????????????????????? GPU ??????????? KV Cache?????????? vLLM ??????????? KV Cache????????KV Cache?????????????????? KV Cache????????????????????????????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??-3????????-api\">?? 3???????? API<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? KV ???? LLM ??????????? LLM ???????????????????????? KV Cache??????????????????????????????????????????????????????????????????????????????????????????????? KV Cache?????????????????? CPU ?????????token KV Cache????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????? KV ???????????????2025 ?????? LMCACHE ??????????????????????????????????????? KV Cache???????????????????????????????? API?????????? KV Cache??? KV Cache?????????? KV Cache?<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"24-???????????\">2.4 ???????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">?????? KV ????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">? 2025 ? 1 ? vLLM ???????????????????????? NVIDIA ? Dynamo?AIBrix?llm-d?SGLang OME ? KServe??????????????? Kubernetes ??????????????????????????????? KV ?????vLLM ????Dynamo?llm-d ? KServe ??? LMCACHE??<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??????-kv-??\">?????? KV ??<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">vLLM ? SGLang ???????????? GPU-to-CPU KV ??????????????????????????????? KV Cache????????????? 7 ???????? LMCACHE ???<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"kv-cache???\">KV Cache???<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">Mooncake?Redis?InfiniStore ? 3FS ?????????????????????????????????????? \u201c???\u201d?????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">Fireworks AI?Together AI ????? API ????????????????????????????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??????\">??????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">????????? KV Cache????????????PD ??? KV ?????????????????? HuggingFace Transformers ??????????????????????????????? SGLang ? vLLM ?????????????<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"3-lmcache-??\">3. LMCACHE ??<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ???????? KV Cache????????????????????????? KV Cache?????????????????????? PD ??????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?? KV Cache????LMCACHE ??? LLM ????????? \/ ???????? 2???????????????? KV ??????????????? vLLM ? SGLang ???????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img decoding=\"async\" width=\"348\" height=\"472\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031154746815.png\" alt=\"A diagram illustrating the architecture of LMCACHE, a distributed KV cache layer situated between LLM inference engines (like vLLM and SGLang) and storage backends (e.g., Mooncake, Redis, infinistore). It shows the connections and data flow between these components.\" class=\"wp-image-949\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031154746815.png 348w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031154746815-221x300.png 221w\" sizes=\"(max-width: 348px) 100vw, 348px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">?2 ???? LMCACHE ??????????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"31-??\">3.1 ??<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">????????LMCACHE ????????? KV ??? GPU ?????????????? 3 ??????????????? KV ????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">???????????? KV ??????token??????????? GPU ?????????????????token????????????????????token???????????????????????????????token? KV Cache??????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?????????? KV Cache??????? KV ?????????token????????????????token????????????? ID ??????? \u2014\u2014 ??????????????????? GPU ????? GPU ???? KV Cache????? GPU ????????????? 4.2 ?????????????????? ID??????????????? KV Cache? CPU ?????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?????????token? KV Cache???????????????????????????????????token????????? KV Cache?????token??? LMCACHE ??????? KV ???????? LMCACHE ???????token??????token??????????token?????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"32-????\">3.2 ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ???????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"lmcache-?????????\">LMCACHE ?????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????? LMCACHE ?????? KV Cache? GPU ???????????????????????? KV ???? CPU ????? PD ???GPU-GPU ?????????????????????? GPU ???????? I\/O ?????????????? vLLM ? SGLang ????????????16-64KB???????? GPU ????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"lmcache-?????????\">LMCACHE ?????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????????? API????????????????? KV ????????????????????????????????????????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ????????????????????????????????? \u2014\u2014 ???????????????????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img decoding=\"async\" width=\"1024\" height=\"675\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456-1024x675.png\" alt=\"A diagram illustrating the architecture of LMCACHE, showing various components such as the Inference Engine, Scheduler, Model Runner, KV Connector, Token Processor, Cache Controller, and Storage Manager. The layout includes connections to GPU memory, CPU memory, SSD, and different transfer channels like NVLink, RDMA, and TCP.\" class=\"wp-image-950\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456-1024x675.png 1024w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456-300x198.png 300w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456-768x506.png 768w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456-1200x791.png 1200w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031155209456.png 1350w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">????? LMCACHE ????????????<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"4-????\">4. ????<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ?????????? KV ???????????????? LLM ??????LMCACHE ???????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><p>?? LLM ???????????? KV ????????? Llama?Qwen?GPT-OSS ?????????? 20KB-63KB????????????????????????<\/p><\/li>\n\n\n\n<li><p>KV ???????? LLM ???????????????????????????? CUDA ??????????????????????? CUDA ????? CPU ????????????????????????<\/p><\/li>\n\n\n\n<li><p>LLM ??????????????? KV ????????????????????????????????????<\/p><\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">???????????????????????????????????? LMCACHE ??????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"41-????\">4.1 ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">?????? KV ??????? I\/O ?????LMCACHE ???????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??????\">??????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ???????? KV ?????????????????????????? 256 ?token?????????? GPU ???????????????????? 16 ??? GPU ?????????????????????????? CPU ?????????????????? GPU ???????????????????? GPU ???LMCACHE ???? CUDA ???????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????--????\">???? \/ ????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ?????????????? CPU ?????????????????? KV Cache???????KV Cache?????????????????????? KV ????? CPU ?????????????????????????????? KV Cache????????????????????? CPU ??????????????????LMCACHE ????????????????????????????? GPU ?????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????-kv-????\">???? KV ????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ???decode????????? KV cache?????????????????????????????????????? LMCACHE ???????????????????? I\/O ???<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"42-???io-??\">4.2 ???I\/O ??<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ???????? LLM ????????????decode????KV Cache????????????????????????????? LLM ??????????LMCACHE ???????? LLM ????? I\/O ??????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"?????\">?????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ????????? KV ????????????????????????????????????? CUDA ?????????????????? KV ????? GPU ???????????????????????????? KV ??????????????? KV ????????? KV ????????????????????????? KV ??????? GPU ???????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"???????\">???????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????? KV Cache??????????????????100 ??????????????????? 50 ???? 50 ?????????LMCACHE ??????????????? KV ???????????????????????????? CPU ???????????????????? KV ????????????????????????????????????????SLO???????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? LMCACHE ????????????????????? CPU ?????? 5%-10% ??????????????LMCACHE ????????????????????????????????????????????????????????????????????????? CPU ?????????????????????? KV ?????????????????? CPU ?????????????????????? KV ?????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"43??????\">4.3??????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">??? KV Cache?????????????????????????????????????????????????????LMCACHE ??????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"?????\">?????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? KV ???????????? GPU ???????????????LMCACHE ???????????????????????? KV ?????????????? CPU ???????????????LMCACHE ??????????????????????????????????????????????????????????????????????????????????????????? PCB ???????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">vLLM ???????? GPU ???????????????????????????LMCACHE ???????????? CPU ?????????????????????????<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>?????Start pointer??GPU ???????????????<\/p><\/li>\n\n\n\n<li><p>?????Current pointer?????? CPU ??????????<\/p><\/li>\n\n\n\n<li><p>?????End pointer???????????????????<\/p><\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">?? 4 ??????????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><strong>??????State #1?<\/strong>??????????????? \/ ?????????????????????<\/li>\n\n\n\n<li><strong>??????State #2?<\/strong>??????????????????????????????? CPU ???<\/li>\n\n\n\n<li><strong>???????State #3?<\/strong>??????????????????????????????????????????? GPU ???<\/li>\n\n\n\n<li><strong>?????State #4?<\/strong>???????????????????????????<\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">?????????????????????????????????????????????????????????????? GPU ? CPU ????????????????????????????????????????????????????????????? 1 ?????????? 3 ??????????????????????? 2 ??????????????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"710\" height=\"842\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031163932170.png\" alt=\"A diagram illustrating four states of page duplication in memory management, showing how free pages are allocated and duplicated in a circular buffer.\" class=\"wp-image-952\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031163932170.png 710w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031163932170-253x300.png 253w\" sizes=\"(max-width: 710px) 100vw, 710px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">???????????????????????<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"5-??-kv-cache??????????????\">5. ?? KV Cache??????????????<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">vLLM ? SGLang ??? LLM ???????????????????????????2025 ?????? 15-20 ?????????????????????????????????????????????????????????????? KV ???????????? LMCACHE ?????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????LMCACHE ????? KV ????????? KV ?????????????????????????????LMCACHE ???????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????? vLLM ??LMCACHE ???????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><p>???????token?????????????? LMCACHE ??????? LMCACHE ???????????token???????<\/p><\/li>\n\n\n\n<li><p>???????????????KV ??????????????????<\/p><\/li>\n<\/ol>\n\n\n\n<p class=\"wp-block-paragraph\">? 2 ??????????????? API ???????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>????<\/th><th>??<\/th><\/tr><\/thead><tbody><tr><td>get_num_new_matched_tokens(query) ? matched_tokens<\/td><td>?? LMCACHE ????????token??<\/td><\/tr><tr><td>update_state_after_alloc(query, blocks, num_external_blocks)<\/td><td>????????? LMCACHE ???? KV ??<\/td><\/tr><tr><td>build_connector_meta(scheduler_output) ? kv_connector_metadata<\/td><td>?? KV ??? LMCACHE ??? GPU ????????????? KV ????? GPU ?????<\/td><\/tr><tr><td>start_load_kv(kv_pointers)<\/td><td>LLM ?????????????? GPU ???? KV ??<\/td><\/tr><tr><td>wait_load_kv(kv_pointers, layer_id)<\/td><td>?? KV ?????????????????<\/td><\/tr><tr><td>start_store_kv(kv_pointer)<\/td><td>???????? KV ??????????<\/td><\/tr><tr><td>wait_store_kv(kv_pointer, layer_id)<\/td><td>?? KV ????????????? KV ???????<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\">? 2 ??????? LMCACHE ??????????? KV ??????????????? vLLM ????????? LMCACHE KV ???????token??????????????????????????????????? LMCACHE KV ????????? KV ?????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><p>?????????????<code>get_num_new_matched_tokens<\/code>??? LMCACHE ???????token???<\/p><\/li>\n\n\n\n<li><p><code>update_state_after_alloc<\/code>???? LMCACHE ?????token????? vLLM ????????????????????<\/p><\/li>\n\n\n\n<li><p>?????token??? 0???<code>build_connector_meta<\/code>?????????????? KV ?????????<\/p><\/li>\n\n\n\n<li><p>?????????????????????<\/p><\/li>\n<\/ol>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>??<code>start_load_kv<\/code>????? KV ??? GPU ??????<\/p><\/li>\n\n\n\n<li><p>?? LLM ??????????<code>wait_load_kv<\/code>????? KV ???????????? KV ????<\/p><\/li>\n\n\n\n<li><p>????????????<code>wait_store_kv<\/code>????? KV ??????????<code>start_store_kv<\/code>???????? KV ??????<\/p><\/li>\n<\/ul>\n\n\n\n<p class=\"wp-block-paragraph\">???????????<\/p>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>??? LLM ????????<code>start_load_kv<\/code>???????? KV ????? GPU ???????? KV ??????? GPU ????????<\/p><\/li>\n\n\n\n<li><p>??????? LLM ????????<code>start_store_kv<\/code>???? KV ????????????<\/p><\/li>\n<\/ul>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"6-?????\">6. ?????<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">?????????????? KV ?????????????????????????????token?????????????????????LMCACHE ???????? KV ??? API?? 3??????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ? KV ?????????????????????????????????????????????????????? LMCACHE ????????????????????????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"??-kv-cache?????\">?? KV Cache?????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????????????<\/p>\n\n\n\n<ol class=\"wp-block-list\">\n<li><p>?????<code>lookup(tokens)<\/code>????????????????? LMCACHE ???????????token????<\/p><\/li>\n\n\n\n<li><p>??<code>query_ip(instance_ids)<\/code>??? ID ??? IP ???<\/p><\/li>\n\n\n\n<li><p>????????token????? IP ???<\/p><\/li>\n<\/ol>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"kv-????\">KV ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">??? KV ?????????????????????? KV ?????<code>move(source, destination, tokens)<\/code>?????token??? KV ??????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"kv-????\">KV ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????????????<code>clear(tokens, instance, location)<\/code>????????????????? GPU ???CPU ??????token KV ???<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"gpu-?????-kv-??\">GPU ????? KV ??<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???????????????????????? GPU ????????????<code>pin(instance, location, tokens)<\/code>??????????token? KV ?????????????GPU ????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????? KV ?????<code>compress(tokens, instance, location, compression_method)<\/code>????????????????????????? KV ??????????????????????????????????? KV ???<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>??<\/th><th>??<\/th><\/tr><\/thead><tbody><tr><td>lookup(tokens) ? {instance_id: hit_tokens}<\/td><td>???????token??????? KV ??????????token?<\/td><\/tr><tr><td>query_ip(instance_ids) ? IP<\/td><td>?????? ID ? IP ??<\/td><\/tr><tr><td>move(source, destination, tokens)<\/td><td>???token? KV ?????????????<\/td><\/tr><tr><td>clear(tokens, instance_id, storage_device)<\/td><td>?????????????????token? KV ??<\/td><\/tr><tr><td>pin(tokens, instance, storage_device)<\/td><td>???token? KV ?????????????????<\/td><\/tr><tr><td>compress(tokens, instance, storage_device, compression_method)<\/td><td>????????????????????????token KV ???????????<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>? 3?LMCACHE ?????????<\/em><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"7-??\">7 ??<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"71-????\">7.1 ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???????????? LMCACHE?? 4???????? LMCACHE ??????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??\">??<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">?? LMCACHE ??????????????????????meta-llama\/Llama-3.1-8B-Instruct?meta-llama\/Llama-3.1-70B-Instruct?Qwen\/Qwen2.5-Coder-32B-Instruct?Qwen\/Qwen3-Coder-480B-A35B-Instruct-FP8?Qwen\/Qwen2.5-72B-Instruct?<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"???\">???<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????LongBench ?????????????? vLLM ????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??\">??<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>???????? GMI Cloud ??? 8\u00d7H100 ?????????????????????? H100 GPU?<\/p><\/li>\n\n\n\n<li><p>??????????????? GPU ??????? CPU ???? KV ??????????<\/p><\/li>\n\n\n\n<li><p>PD ????????????????????????? GPU ????? NVLink ???<\/p><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>?token?????TTFT????????<\/p><\/li>\n\n\n\n<li><p>token????ITL????????token????????<\/p><\/li>\n\n\n\n<li><p>?????????? CPU ??? PD ?????????<\/p><\/li>\n<\/ul>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>vLLM?????????? GPU ??????? KV ???<\/p><\/li>\n\n\n\n<li><p>???? 1?2?3??????????????? GPU ????????<\/p><\/li>\n<\/ul>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>??<\/th><th>??? \/ ???<\/th><th>????<\/th><th>??????<\/th><\/tr><\/thead><tbody><tr><td>CPU ??<\/td><td>???<\/td><td>&#8211;<\/td><td>??? CPU ????<\/td><\/tr><tr><td>?????<\/td><td>???<\/td><td>???<\/td><td>??????????<\/td><\/tr><tr><td>PD ??<\/td><td>???<\/td><td>NVLink<\/td><td>PD????<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>? 4???????<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"72-???-cpu-??\">7.2 ??? CPU ??<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">? CPU ??????? 4??????????????????????????????????? LLM ???? 10K token?? 12 ? PDF ????????????????????Llama-3.1-8B-Instruct ???? 20K token?????????????????????LLM ???? 100 ?token???????????? 40 ???????????????QPS?????? LMCACHE ??? KV ????? CPU ??? 500GB?<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?? 5 ???LMCACHE ? TTFT ? ITL ???????????????????? QPS?? QPS=1???LMCACHE ??????????????????????????????? 2.3-14 ??????? TTFT?? ITL ???LMCACHE ???????? \u2014\u2014 ????????token???????token????????????????? 1 ? 2 ????? Qwen3-Coder-480B ???<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"408\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691-1024x408.png\" alt=\"A comparative performance graph showing TTFT and ITL metrics for various AI models (Llama3.1-8B, Llama3.1-70B, Qwen2.5-72B-Instruct, Qwen2.5-Coder-32B, and Qwen3-Coder-480B) across different query per second (QPS) values. The graph features data points for LMCache, Naive vLLM, and two commercial systems, highlighting the differences in response time and throughput.\" class=\"wp-image-953\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691-1024x408.png 1024w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691-300x120.png 300w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691-768x306.png 768w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691-1200x478.png 1200w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170012691.png 1400w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?5???? LMCACHE ???????????? TTFT?ITL ? QPS ??<\/em><\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"lmcache-??????\">LMCACHE ??????<\/h4>\n\n\n\n<ul class=\"wp-block-list\">\n<li><p>??? GPU ??????? KV ??? vLLM ?????????LMCACHE ?? CPU ????? KV ???????????????????????? CPU-GPU ??????????????????<\/p><\/li>\n\n\n\n<li><p>???? 1 ???? KV ?????????????????? 2 ????????????? LMCACHE????????????????????????<\/p><\/li>\n<\/ul>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"73-????????\">7.3 ????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">??????????? 4???? LMCACHE ?? 15Gbps ???? GPU ????????????????? LongBench ?? TriviaQA ??????????????????? vLLM ?????????????????? QPS ??????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?? 7 ???LMCACHE ??? QPS ?????????????????????? 1.3-3 ???????????????????? CPU ????? KV ??????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????? KV ???????? CPU ?????????????????????????????????????????????????????????? 7.7 ????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"444\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769-1024x444.png\" alt=\"Comparison of LMCACHE, Naive vLLM, and Commercial 1 across different models (Llama3.1-70B, Qwen2.5-72B-Instruct, Qwen2.5-Coder-32B, Qwen3-Coder-480B) showing TTFT and ITL metrics as QPS increases.\" class=\"wp-image-954\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769-1024x444.png 1024w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769-300x130.png 300w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769-768x333.png 768w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769-1200x520.png 1200w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170130769.png 1288w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?7???? LMCACHE ?????????????? TTFT?ITL ? QPS ??<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"74-pd-??\">7.4 PD ??<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">? PD ?????????????????????????????? LMCACHE ? vLLM ?? PD ???????? 8K token??? 200 token???? 8 ???LMCACHE ? 95 ?? TTFT ???? vLLM ?? PD ?????? TTFT ???LMCACHE ?????? \u2014\u2014 ???????? TTFT ?? 1.53-1.84 ???? ITL ?? 1.12-1.66 ??<\/p>\n\n\n\n<figure class=\"wp-block-image size-large\"><img loading=\"lazy\" decoding=\"async\" width=\"1024\" height=\"411\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026-1024x411.png\" alt=\"Graphs comparing the performance of LMCACHE and naive vLLM for different models (Llama3-8B, Llama3-70B, Qwen-32B, Qwen-72B), displaying cumulative distribution functions (CDF) for TTFT (Time to First Token) and ITL (Inter-Token Latency).\" class=\"wp-image-956\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026-1024x411.png 1024w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026-300x120.png 300w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026-768x308.png 768w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026-1200x482.png 1200w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170153026.png 1390w\" sizes=\"(max-width: 1024px) 100vw, 1024px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?8????????? LMCACHE ? vLLM ?? PD ??????????<\/em><\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"lmcache-??????-1\">LMCACHE ??????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ?? PD ?????????????????????????????? KV ?????? GPU ??????????????????????? KV ????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????vLLM ?? PD ?????? NIXL ?????????????????? KV ??????????????????? KV ?????????????????????????? KV ??????????????? GPU ??????????????????????????? 4 ??????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">? 11 ??? LLM ???????????????????????????????? KV ??????LMCACHE ? vLLM ?? PD ????????????????? LMCACHE ?????? KV ?????????????????? PD ????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"622\" height=\"272\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170304877.png\" alt=\"A bar chart comparing the execution times of LMCache and naive vLLM, displaying the duration of prefill, decode, and transmit phases.\" class=\"wp-image-957\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170304877.png 622w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170304877-300x131.png 300w\" sizes=\"(max-width: 622px) 100vw, 622px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?11???? LMCACHE ? vLLM ?? PD ?????????????????<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"75-????????\">7.5 ????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???????????????? LMCACHE?????????????????????????????????????????????????????????????????? 4K token?<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????? Sao10K\/L3-8B-Lunaris-v1 ??????????????????????????????????????? 1 ??????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?? 6 ?????? QPS ??LMCACHE ?????????????? vLLM????? TTFT ? ITL???????????????????QPS ? 2-5 ??LMCACHE ????? vLLM ?? 25%?QPS ? 6 ??????? 49%?<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"694\" height=\"310\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170228014-1.png\" alt=\"A graphical comparison of TTFT (in seconds) and ITL (in milliseconds) between LMCache and Naive vLLM across different QPS levels. The left graph shows TTFT values, while the right graph displays ITL values.\" class=\"wp-image-959\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170228014-1.png 694w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170228014-1-300x134.png 300w\" sizes=\"(max-width: 694px) 100vw, 694px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?6???? LMCACHE ??? vLLM ??????? TTFT?ITL ? QPS ??<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"76-?????\">7.6 ?????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">?????? LMCACHE ?????????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"cpu-??\">CPU ??<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">? 5 ??? LMCACHE ? vLLM ?? CPU ??? CPU ?? KV ????????LMCACHE ????????? vLLM ?????????????????vLLM ?? CPU ????????????? LMCACHE ??????????????? CUDA ???????????????????????????????????????????LMCACHE ?????????????????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-table\"><table class=\"has-fixed-layout\"><thead><tr><th>??<\/th><th>????<\/th><\/tr><\/thead><tbody><tr><td>LMCACHE<\/td><td>400 Gbps<\/td><\/tr><tr><td>vLLM ?? CPU ??<\/td><td>88 Gbps<\/td><\/tr><\/tbody><\/table><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>? 5?LMCACHE ? vLLM ?? CPU ?????????<\/em><\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????\">????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">? 10 ??? LMCACHE ?????????????????????????????????????????????? \/ ????????????????????????? \/ ?????? KV ???????????????? 1.46 ??<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"682\" height=\"726\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170328296.png\" alt=\"A comparison of request handling between LMCACHE (using asynchronous input\/output) and vLLM (using synchronous input\/output). The chart displays the duration of different phases: prefill, decode, and loading for various request IDs over a timeline.\" class=\"wp-image-961\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170328296.png 682w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170328296-282x300.png 282w\" sizes=\"(max-width: 682px) 100vw, 682px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?10???? LMCACHE ?? I\/O ???? KV ?????????<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"77-?????\">7.7 ?????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????????????? LMCACHE ??????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"????????\">????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">? 12 ??? B200 ?????????????????????????????????32 Gbps?????????????? 256K token?LMCACHE ? KV ???????????????????64 ? 128 Gbps???LMCACHE ??????????????????????????????LMCACHE ? KV ???????????????????????????????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"518\" height=\"370\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170348206.png\" alt=\"A graph showing the relationship between context length (in K tokens) and latency (in seconds) for different bandwidths: 32 Gbps, 64 Gbps, and 128 Gbps, along with a line for prefill latency.\" class=\"wp-image-962\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170348206.png 518w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170348206-300x214.png 300w\" sizes=\"(max-width: 518px) 100vw, 518px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?12????????????????????? \/ ???????<\/em><\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"78-sglang-????\">7.8 SGLang ????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???????? vLLM?????? LMCACHE ? SGLang ??????? 9 ?????? H100 GPU ??? Qwen3-32B ???TP=2???? LMCACHE CPU ?????????? CPU ??? SGLang ???LMCACHE ??????????????? TTFT ???????? SGLang ?? CPU ?????LMCACHE ???????????? LMCACHE ???????????????????SGLang ?? CPU ???????? LMCACHE ???????????????????????? CPU \/ ????????????????????<\/p>\n\n\n\n<figure class=\"wp-block-image size-full\"><img loading=\"lazy\" decoding=\"async\" width=\"468\" height=\"862\" src=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170400575.png\" alt=\"A graph showing performance metrics for SGL with and without LMCache, comparing throughput (QPS), average time to first token (TTFT), and average latency at different request rates (QPS). The graph features lines and markers representing SGL with LMCache, standard SGL, and SGL with CPU offloading.\" class=\"wp-image-963\" srcset=\"https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170400575.png 468w, https:\/\/identia.digital\/lmcache\/wp-content\/uploads\/2025\/11\/image-20251031170400575-163x300.png 163w\" sizes=\"(max-width: 468px) 100vw, 468px\" \/><\/figure>\n\n\n\n<p class=\"wp-block-paragraph\"><em>?9???????????LMCACHE ? SGLang ??????????? TTFT ?????<\/em><\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"8-?????????\">8. ?????????<\/h2>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"81-kv-???????????\">8.1 KV ???????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ????????????????????? KV ??????????? GPU ??????????????? KV ????? GPU ?????????????????????? GPU ??????????????? KV ????? CPU ???CPU ???????????????????????????????? \u2014\u2014 ???????????????????????????????? TTFT?<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????????????????????????????? GPU ????????????????????? LMCACHE ? KV ???????? 4 ???????????? KV ?????????? GPU ?????????????????????? KV ???????????????????????token????????????????????????????????????????????? KV ?????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????? CPU ??? PD ???????????????? KV ??????????????????????? CPU ???????? CPU ??? PD ????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"82-???????????\">8.2 ???????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???????? KV ??????????????????????????????????????????? LLM ????????? KV ??????????????????LLM ????????????????token???????????????????????????????????? KV ??????????????????? LMCACHE ????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">??????????? KV ????????????????????????????????????????????? GPU ???????token???????????????????????????????????????????????????????????????????????????? KV ??????????????????????????????????????????????????????? GPU ?????????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">????????????????????????? KV ??????????????????????????????????? \u201c??\u201d ???????? KV ?????? KV ????????????????????????? I\/O ??????????????????????????????? KV ???????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"83-???????\">8.3 ???????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">???? LLM ??????????????????????? LMCACHE ???????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"???????\">???????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????????? Docker ??????????????????? LMCACHE ?????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"???????\">???????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????????????????????????????????????????????????????????LMCACHE ????? KV ??????????????????LMCACHE ???????? KV ?????????????? KV ???????????????????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"??????????????\">??????????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">??????????????????????????????????? KV ????????????????????????????????????? LMCACHE ?????????????????????LLM ?????????? KV ???????????<\/p>\n\n\n\n<h4 class=\"wp-block-heading\" id=\"???????????\">???????????<\/h4>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ????????????????LMCACHE ???? vLLM ??????? SGLang ???????????????????????????????????LMCACHE ?????????????????????????<\/p>\n\n\n\n<h3 class=\"wp-block-heading\" id=\"84-??????????????\">8.4 ??????????????<\/h3>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ????? 2024 ??????????? KV ????????????????????? 2025 ????????????? \u2014\u2014KV ?????????? LLM ??????????2025 ???LMCACHE ?????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">??????? LMCACHE ?????????????????????????????????????????????????????????? KV ????token????????????? LMCACHE ???token?????? \u2014\u2014LMCACHE ????????token????????????????????? LMCACHE ???????????????????????????? LMCACHE ???<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????????????????? LLM ????????????????????? KV ???????? Rust ? C++ ?????????????? Python ?? LMCACHE??????????????Python ????????????????????????????????????????????LMCACHE ???????????????? CUDA ???<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"9-?????\">9. ?????<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">???? LMCACHE\u2014\u2014 ?????????????? LLM ?? KV ??????? KV ???????????????????????LMCACHE ? LLM ??????token?????????????????????????????????????LMCACHE ???????????????????????? API????????LMCACHE ????????????????? CPU ???????? PD ????????token?????????????????????????????????????? KV ???????????????????????? LMCACHE ????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">?????LMCACHE ????????????KV ??? AI ????????? LLM ???????????????????? KV ???????????????LMCACHE ?????????? \u2014\u2014 ????????????????????????????????????????????????????????????? LLM ?????????? KV ??? AI ?????????????????????????????????<\/p>\n\n\n\n<p class=\"wp-block-paragraph\">LMCACHE ??????<a href=\"https:\/\/github.com\/LMCACHE\/LMCACHE\">https:\/\/github.com\/LMCACHE\/LMCACHE<\/a>?<\/p>\n\n\n\n<hr class=\"wp-block-separator has-alpha-channel-opacity\"\/>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"10-??\">10. ??<\/h2>\n\n\n\n<p class=\"wp-block-paragraph\">?? LMCACHE ?????????????Baolong Mao and Chunxiao Zheng????????????Martin Hickey??? GitHub ??????Huaizheng Zhang?Siddhant Ray?Gu Zhuohan?Hanchen Li????????????Rui Zhang ??? LMCACHE ????Qizheng Zhang?Hussain Mohammad?????????<\/p>\n\n\n\n<h2 class=\"wp-block-heading\" id=\"????\">????<\/h2>\n\n\n\n<ol class=\"wp-block-list\">\n<li><p>Best 44 large language models (LLMs) in 2025. <a href=\"https:\/\/explodingtopics.com\/blog\/list-of-llms\">https:\/\/explodingtopics.com\/blog\/list-of-llms<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>Bai Y, Lv X, Zhang J, et al. Longbench: A bilingual, multitask benchmark for long context understanding, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2308.14508\">https:\/\/arxiv.org\/abs\/2308.14508<\/a>.<\/p><\/li>\n\n\n\n<li><p>ByteDance. InfiniStore: Kv cache store for distributed llm inference. <a href=\"https:\/\/github.com\/bytedance\/InfiniStore\">https:\/\/github.com\/bytedance\/InfiniStore<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>Caylent. Prompt caching: Saving time and money in llm applications. <a href=\"https:\/\/caylent.com\/blog\/prompt-caching-saving-time-and-money-in-llm-applications\">https:\/\/caylent.com\/blog\/prompt-caching-saving-time-and-money-in-llm-applications<\/a>, 2024.<\/p><\/li>\n\n\n\n<li><p>Chen S, Jiang R, Yu D, et al. Kvdirect: Distributed disaggregated llm inference, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2501.14743\">https:\/\/arxiv.org\/abs\/2501.14743<\/a>.<\/p><\/li>\n\n\n\n<li><p>Chen W, He S, Qu H, et al. IMPRESS: An Importance-Informed Multi-Tier prefix KV storage system for large language model inference. In: 23rd USENIX Conference on File and Storage Technologies (FAST 25), Santa Clara, CA, 2025: 187-201.<\/p><\/li>\n\n\n\n<li><p>DeepSeek AI Contributors. deepseek-ai\/3fs: A high-performance distributed file system for ai training and inference workloads. <a href=\"https:\/\/github.com\/deepseek-ai\/3FS\">https:\/\/github.com\/deepseek-ai\/3FS<\/a>, 2025a.<\/p><\/li>\n\n\n\n<li><p>KServe Contributors. kserve\/kserve: Standardized distributed generative and predictive ai inference platform for scalable, multi-framework deployment on kubernetes. <a href=\"https:\/\/github.com\/kserve\/kserve\">https:\/\/github.com\/kserve\/kserve<\/a>, 2025b.<\/p><\/li>\n\n\n\n<li><p>Databricks Research. How long should you train your language model? <a href=\"https:\/\/www.databricks.com\/blog\/how-long-should-you-train-your-language-model\">https:\/\/www.databricks.com\/blog\/how-long-should-you-train-your-language-model<\/a>, 2024.<\/p><\/li>\n\n\n\n<li><p>Du D, Cao S, Cheng J, et al. Bitdecoding: Unlocking tensor cores for long-context llms with low-bit kv cache, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2503.18773\">https:\/\/arxiv.org\/abs\/2503.18773<\/a><\/p><\/li>\n\n\n\n<li><p>Gao B, He Z, Sharma P, et al. Cost-efficient large language model serving for multi-turn conversations with cached-attention, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2403.19708\">https:\/\/arxiv.org\/abs\/2403.19708<\/a>.<\/p><\/li>\n\n\n\n<li><p>Ge S, Zhang Y, Liu L, et al. Model tells you what to discard: Adaptive kv cache compression for llms, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2310.01801\">https:\/\/arxiv.org\/abs\/2310.01801<\/a>.<\/p><\/li>\n\n\n\n<li><p>Gim I, Chen G, Lee S S, et al. Prompt cache: Modular attention reuse for low-latency inference. In: Proceedings of the Seventh Annual Conference on Machine Learning and Systems (MLSys 2024), Santa Clara, CA, 2024.<\/p><\/li>\n\n\n\n<li><p>GMI Cloud. Gmi cloud: Gpu cloud solutions for scalable ai &amp; inference. <a href=\"https:\/\/www.gmicloud.ai\/\">https:\/\/www.gmicloud.ai\/<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>Jegou S, Jeblick M, Devoto A, et al. Kvpress: Efficient kv cache compression for long-context llms, 2024. <a href=\"https:\/\/github.com\/NVIDIA\/kvpress\">https:\/\/github.com\/NVIDIA\/kvpress<\/a>.<\/p><\/li>\n\n\n\n<li><p>Jin C, Zhang Z, Jiang X, et al. Ragcache: Efficient knowledge caching for retrieval-augmented generation, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2404.12457\">https:\/\/arxiv.org\/abs\/2404.12457<\/a>.<\/p><\/li>\n\n\n\n<li><p>Jin S, Liu X, Zhang Q, et al. Compute or load KV cache? why not both? In: Forty-second International Conference on Machine Learning, 2025a.<\/p><\/li>\n\n\n\n<li><p>Jin S, Liu X, Zhang Q, et al. Compute or load KV cache? why not both? In: Forty-second International Conference on Machine Learning, 2025b.<\/p><\/li>\n\n\n\n<li><p>Kwon W, et al. Demystifying nccl: An in-depth analysis of gpu-based collective communication. arXiv preprint arXiv:2507.04786, 2025.<\/p><\/li>\n\n\n\n<li><p>Kwon W, Li Z, Zhuang S, et al. Efficient memory management for large language model serving with paged-attention. In: Proceedings of the 29th Symposium on Operating Systems Principles (SOSP \u201923), New York, NY, 2023a: 611-626.<\/p><\/li>\n\n\n\n<li><p>Kwon W, Li Z, Zhuang S, et al. Efficient memory management for large language model serving with paged-attention, 2023b. <a href=\"https:\/\/arxiv.org\/abs\/2309.06180\">https:\/\/arxiv.org\/abs\/2309.06180<\/a>.<\/p><\/li>\n\n\n\n<li><p>Kwon W, Li Z, Zhuang S, et al. Efficient memory management for large language model serving with paged-attention. In: Proceedings of the ACM SIGOPS 29th Symposium on Operating Systems Principles, 2023c.<\/p><\/li>\n\n\n\n<li><p>Lee W, Lee J, Seo J, et al. InfiniGen: Efficient generative inference of large language models with dynamic KV cache management. In: 18th USENIX Symposium on Operating Systems Design and Implementation (OSDI 24), Santa Clara, CA, 2024: 155-172.<\/p><\/li>\n\n\n\n<li><p>Li J, Zhang Y, Hassan M Y, et al. Commvq: Commutative vector quantization for kv cache compression, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2506.18879\">https:\/\/arxiv.org\/abs\/2506.18879<\/a>.<\/p><\/li>\n\n\n\n<li><p>Li Y, Huang Y, Yang B, et al. Snapkv: Llm knows what you are looking for before generation, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2404.14469\">https:\/\/arxiv.org\/abs\/2404.14469<\/a>.<\/p><\/li>\n\n\n\n<li><p>Liu J, Chung J W, Wu Z, et al. Andes: Defining and enhancing quality-of-experience in llm-based text streaming services, 2024a. <a href=\"https:\/\/arxiv.org\/abs\/2404.16283\">https:\/\/arxiv.org\/abs\/2404.16283<\/a>.<\/p><\/li>\n\n\n\n<li><p>Liu Y, Li H, Cheng Y, et al. Cachegen: Kv cache compression and streaming for fast large language model serving, 2024b. <a href=\"https:\/\/arxiv.org\/abs\/2310.07240\">https:\/\/arxiv.org\/abs\/2310.07240<\/a>.<\/p><\/li>\n\n\n\n<li><p>Liu Z, Yuan J, Jin H, et al. Kivi: A tuning-free asymmetric 2bit quantization for kv cache. arXiv preprint arXiv:2402.02750, 2024c.<\/p><\/li>\n\n\n\n<li><p>llm-d Project. llm-d: A kubernetes-native high-performance distributed llm inference framework. <a href=\"https:\/\/github.com\/llm-d\/llm-d\">https:\/\/github.com\/llm-d\/llm-d<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>Meta Engineering. Roce networks for distributed ai training at scale. <a href=\"https:\/\/engineering.fb.com\/2024\/08\/05\/data-center-engineering\/roce-network-distributed-ai-training-at-scale\/\">https:\/\/engineering.fb.com\/2024\/08\/05\/data-center-engineering\/roce-network-distributed-ai-training-at-scale\/<\/a>, 2024.<\/p><\/li>\n\n\n\n<li><p>Nie C, Fonseca R, Liu Z. Aladdin: Joint placement and scaling for slo-aware llm serving, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2405.06856\">https:\/\/arxiv.org\/abs\/2405.06856<\/a>.<\/p><\/li>\n\n\n\n<li><p>NVIDIA Corporation. Nvidia dynamo: A datacenter-scale distributed inference serving framework. <a href=\"https:\/\/github.com\/ai-dynamo\/dynamo\">https:\/\/github.com\/ai-dynamo\/dynamo<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>NVIDIA Developer Forums. Why is the transfer throughput low when transferring small size data (gpu host\/device transfers). <a href=\"https:\/\/forums.developer.nvidia.com\/t\/why-is-the-transfer-throughput-low-when-transferring-small-size-data-from-host-to-device-or-device-to-host\/153962\">https:\/\/forums.developer.nvidia.com\/t\/why-is-the-transfer-throughput-low-when-transferring-small-size-data-from-host-to-device-or-device-to-host\/153962<\/a>, 2020.<\/p><\/li>\n\n\n\n<li><p>Patel P, Choukse E, Zhang C, et al. Splitwise: Efficient generative llm inference using phase splitting, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2311.18677\">https:\/\/arxiv.org\/abs\/2311.18677<\/a>.<\/p><\/li>\n\n\n\n<li><p>Qin R, Li Z, He W, et al. Mooncake: Trading more storage for less computation &#8211; a KVCache-centric architecture for serving LLM chatbot. In: 23rd USENIX Conference on File and Storage Technologies (FAST 25), Santa Clara, CA, 2025a: 155-170.<\/p><\/li>\n\n\n\n<li><p>Qin R, Li Z, He W, et al. Mooncake: A kvcache-centric disaggregated architecture for llm serving, 2025b. <a href=\"https:\/\/arxiv.org\/abs\/2407.00079\">https:\/\/arxiv.org\/abs\/2407.00079<\/a>.<\/p><\/li>\n\n\n\n<li><p>Qin Z, Cao Y, Lin M, et al. Cake: Cascading and adaptive kv cache eviction with layer preferences, 2025c. <a href=\"https:\/\/arxiv.org\/abs\/2503.12491\">https:\/\/arxiv.org\/abs\/2503.12491<\/a>.<\/p><\/li>\n\n\n\n<li><p>Redis. Redis enterprise software reference &#8211; redis documentation. <a href=\"https:\/\/redis.io\/docs\/latest\/operate\/rs\/references\/\">https:\/\/redis.io\/docs\/latest\/operate\/rs\/references\/<\/a>, 2025<\/p><\/li>\n\n\n\n<li><p>Ren Z, Doekemeijer K, De Matteis T, et al. An i\/o characterizing study of offloading llm models and kv caches to nvme ssd. In: Proceedings of the 5th Workshop on Challenges and Opportunities of Efficient and Performant Storage Systems (CHEOPS \u201925), New York, NY, 2025: 23-33.<\/p><\/li>\n\n\n\n<li><p>Shi X, Cai C, Du J, et al. Nexus: proactive intra-gpu disaggregation of prefill and decode in llm serving, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2507.06608\">https:\/\/arxiv.org\/abs\/2507.06608<\/a>.<\/p><\/li>\n\n\n\n<li><p>Strecker W D. Vax-11\/780: A virtual address extension to the dec pdp-11 family. In: Proceedings of the National Computer Conference, Montvale, NJ, 1978: 967-980.<\/p><\/li>\n\n\n\n<li><p>Tang J, Zhao Y, Zhu K, et al. Quest: Query-aware sparsity for efficient long-context llm inference, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2406.10774\">https:\/\/arxiv.org\/abs\/2406.10774<\/a>.<\/p><\/li>\n\n\n\n<li><p>The AIBrix Team, Shan J, Gupta V, et al. Aibrix: Towards scalable, cost-effective large language model inference infrastructure, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2504.03648\">https:\/\/arxiv.org\/abs\/2504.03648<\/a>.<\/p><\/li>\n\n\n\n<li><p>The SGLang Team. Ome: Revolutionizing llm infrastructure with model-driven architecture. <a href=\"https:\/\/lmsys.org\/blog\/2025-07-08-ome\/\">https:\/\/lmsys.org\/blog\/2025-07-08-ome\/<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>UCCL Team. Everything you want to know about kv cache transfer engine. <a href=\"https:\/\/uccl-project.github.io\/posts\/kv-transfer-engine\/\">https:\/\/uccl-project.github.io\/posts\/kv-transfer-engine\/<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>vLLM project. vllm production stack: Reference system for k8s-native cluster-wide deployment with community-driven performance optimization. <a href=\"https:\/\/github.com\/vllm-project\/production-stack\">https:\/\/github.com\/vllm-project\/production-stack<\/a>, 2025.<\/p><\/li>\n\n\n\n<li><p>Xiao G, Tang J, Zuo J, et al. Duoattention: Efficient long-context llm inference with retrieval and streaming heads, 2024a. <a href=\"https:\/\/arxiv.org\/abs\/2410.10819\">https:\/\/arxiv.org\/abs\/2410.10819<\/a>.<\/p><\/li>\n\n\n\n<li><p>Xiao G, Tian Y, Chen B, et al. Efficient streaming language models with attention sinks, 2024b. <a href=\"https:\/\/arxiv.org\/abs\/2309.17453\">https:\/\/arxiv.org\/abs\/2309.17453<\/a>.<\/p><\/li>\n\n\n\n<li><p>Xie Z, Xu Z, Zhao M, et al. Strata: Hierarchical context caching for long context language model serving, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2508.18572\">https:\/\/arxiv.org\/abs\/2508.18572<\/a>.<\/p><\/li>\n\n\n\n<li><p>Yang H, Zhang R, Huang M, et al. Kvshare: An llm service system with efficient and effective multi-tenant kv cache reuse, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2503.16525\">https:\/\/arxiv.org\/abs\/2503.16525<\/a>.<\/p><\/li>\n\n\n\n<li><p>Ye L, Tao Z, Huang Y, et al. ChunkAttention: Efficient self-attention with prefix-aware KV cache and two-phase partition. In: Proceedings of the 62nd Annual Meeting of the Association for Computational Linguistics (Volume 1: Long Papers), Bangkok, Thailand, 2024: 11608-11620.<\/p><\/li>\n\n\n\n<li><p>Yu L, Lin J, Li J. Stateful large language model serving with pensieve. In: Proceedings of the Twentieth European Conference on Computer Systems (EuroSys \u201925), New York, NY, 2025: 144-158.<\/p><\/li>\n\n\n\n<li><p>Zhang H, Ji X, Chen Y, et al. Pqcache: Product quantization-based kvcache for long context llm inference, 2025. <a href=\"https:\/\/arxiv.org\/abs\/2407.12820\">https:\/\/arxiv.org\/abs\/2407.12820<\/a>.<\/p><\/li>\n\n\n\n<li><p>Zhang Y, Li F, Tang Y, et al. Optimizing llm queries in relational workloads. arXiv preprint arXiv:2403.05821, 2024.<\/p><\/li>\n\n\n\n<li><p>Zhao Y, Yang S, Zhu K, et al. Blendserve: Optimizing offline inference for auto-regressive large models with resource-aware batching, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2411.16102\">https:\/\/arxiv.org\/abs\/2411.16102<\/a>.<\/p><\/li>\n\n\n\n<li><p>Zheng L, Yin L, Xie Z, et al. Sglang: Efficient execution of structured language model programs, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2312.07104\">https:\/\/arxiv.org\/abs\/2312.07104<\/a>.<\/p><\/li>\n\n\n\n<li><p>Zhong Y, Liu S, Chen J, et al. Distserve: Disaggregating prefill and decoding for goodput-optimized large language model serving, 2024. <a href=\"https:\/\/arxiv.org\/abs\/2401.09670\">https:\/\/arxiv.org\/abs\/2401.09670<\/a>.<\/p><\/li>\n\n\n\n<li><p>Zhou Y, Chen Z, Mao Z, et al. An extensible software transport layer for gpu networking. arXiv preprint arXiv:2504.17307, 2025.<\/p><br><\/li>\n<\/ol>\n","protected":false},"excerpt":{"rendered":"<p>???Yihua Cheng ?Yuhan Liu ? Jiayi Yao * ?Yuwei An?Xiaokun Chen?Shaoting Feng ? Yuyang Huang?Samuel Shen?Kuntai Du?Junchen Jiang ???TensorMesh&amp;????? ?? ?????????LLM???????????????????????????????????????????????????????????KV Cache???????????????????????????? GPU ??????????????? ?????????????KV Cache?????????? LMCACHE???????????? KV Cache????????????????? LLM ?????vLLM ? SGLang???? KV Cache??????????????LMCACHE ? LLM ??????? KV Cache???????? LLM ??????token??????? KV Cache?????????????????????????????????????????????PD??????????????LMCACHE ???????????????????1?????? KV Cache????????????????????? I\/O ??????????2????? KV Cache??????? LMCACHE ??????????????3?????? API??????????????????????? GPU?CPU?????????????????????????LMCACHE [&hellip;]<\/p>\n","protected":false},"author":271290516,"featured_media":949,"comment_status":"open","ping_status":"open","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[],"tags":[],"class_list":["post-943","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry"],"_links":{"self":[{"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/posts\/943","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/users\/271290516"}],"replies":[{"embeddable":true,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/comments?post=943"}],"version-history":[{"count":0,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/posts\/943\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/media\/949"}],"wp:attachment":[{"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/media?parent=943"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/categories?post=943"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/identia.digital\/lmcache\/wp-json\/wp\/v2\/tags?post=943"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}