{"id":2064,"date":"2026-02-26T16:15:22","date_gmt":"2026-02-26T08:15:22","guid":{"rendered":"https:\/\/www.starverse-ai.com\/guide\/archives\/2064"},"modified":"2026-02-26T16:15:22","modified_gmt":"2026-02-26T08:15:22","slug":"%e7%a7%91%e7%a0%94%e5%9b%a2%e9%98%9f%e5%a6%82%e4%bd%95%e5%9c%a8-3-%e5%a4%a9%e5%86%85%e5%be%ae%e8%b0%83-70b-%e4%b8%ad%e6%96%87%e5%a4%a7%e6%a8%a1%e5%9e%8b%ef%bc%9f%e6%95%b0%e6%8d%ae%e9%9b%86%e7%ae%97","status":"publish","type":"post","link":"https:\/\/www.starverse-ai.com\/guide\/archives\/2064","title":{"rendered":"\u79d1\u7814\u56e2\u961f\u5982\u4f55\u5728 3 \u5929\u5185\u5fae\u8c03 70B \u4e2d\u6587\u5927\u6a21\u578b\uff1f\u6570\u636e\u96c6+\u7b97\u529b\u4e00\u7ad9\u5f0f\u653b\u7565"},"content":{"rendered":"<figure class=\"wp-block-image size-large\"><img decoding=\"async\" src=\"https:\/\/www.starverse-ai.com\/guide\/wp-content\/uploads\/2026\/02\/1772093721_55ccae.png\" alt=\"\u79d1\u7814\u56e2\u961f\u5982\u4f55\u5728 3 \u5929\u5185\u5fae\u8c03 70B \u4e2d\u6587\u5927\u6a21\u578b\uff1f\u6570\u636e\u96c6+\u7b97\u529b\u4e00\u7ad9\u5f0f\u653b\u7565\" style=\"display:block; margin:10px auto; max-width:100%; height:auto;\" \/><\/figure>\n<blockquote>\n<p>\u201c\u5927\u6a21\u578b\u5fae\u8c03\u4e00\u6b21\uff0c\u6570\u636e\u6e05\u6d17\u4e09\u5929\uff0c\u8bad\u7ec3\u4e09\u5929\uff0c\u8c03\u53c2\u4e09\u5929\uff0c\u6700\u540e\u53d1\u73b0\u9884\u7b97\u53ea\u5269\u4e09\u5929\u3002\u201d<br \/>\n\u2014\u2014 \u67d0 985 \u9ad8\u6821 NLP \u5b9e\u9a8c\u5ba4\u5410\u69fd\u5e16\uff0c\u70b9\u8d5e 1.2 \u4e07<\/p>\n<\/blockquote>\n<p>\u8fd9\u6761\u9ad8\u8d5e\u8bc4\u8bba\uff0c\u7cbe\u51c6\u6233\u4e2d\u4e86 2024 \u5e74\u79d1\u7814\u4eba\u7684\u96c6\u4f53\u75db\u70b9\uff1a70B \u53c2\u6570\u7ea7\u522b\u7684\u4e2d\u6587\u5927\u6a21\u578b\uff0c\u60f3\u4ece\u201c\u80fd\u7528\u201d\u5230\u201c\u597d\u7528\u201d\uff0c\u5fc5\u987b\u5728 7 
\u5929\u5185\u5b8c\u6210\u6570\u636e\u53bb\u91cd\u3001\u683c\u5f0f\u5bf9\u9f50\u3001\u5206\u5e03\u5f0f\u8bad\u7ec3\u3001\u65ad\u70b9\u7eed\u8bad\u3001\u6307\u6807\u590d\u73b0\uff0c\u8fd8\u8981\u628a\u7ecf\u8d39\u9501\u5728 5 \u4e07\u5143\u4ee5\u5185\u3002\u65f6\u95f4\u3001\u7b97\u529b\u3001\u6570\u636e\u3001\u94b1\u5305\uff0c\u56db\u5ea7\u5927\u5c71\u540c\u65f6\u538b\u4e0b\u6765\uff0c\u5f88\u591a\u56e2\u961f\u8fd8\u6ca1\u5f00\u59cb\u5c31\u5148\u201c\u529d\u9000\u201d\u3002<\/p>\n<h2>\u56db\u5ea7\u5927\u5c71\uff0c\u600e\u4e48\u4e00\u591c\u63a8\u5e73\uff1f<\/h2>\n<p>\u6708\u521d\uff0c\u4eba\u5927\u00d7\u4e2d\u79d1\u9662\u00d7\u6e2f\u4e2d\u6587\u8054\u5408\u56e2\u961f\u53d1\u5e03\u4e86\u4e00\u5219\u6280\u672f\u5e16\uff1a\u4ed6\u4eec\u5728<strong>3 \u5929\u5185\u5b8c\u6210 70B \u4e2d\u6587\u5927\u6a21\u578b\u5fae\u8c03<\/strong>\uff0c\u6700\u7ec8 Loss \u4ece 2.47 \u964d\u5230 1.65\uff0cC-Eval \u63d0\u5347 7.8 \u5206\u3002\u5e16\u5b50\u91cc\u53cd\u590d\u63d0\u5230\u4e00\u4e2a\u5e73\u53f0\u2014\u2014<a href=\"https:\/\/www.starverse-ai.com\">\u661f\u5b87\u667a\u7b97<\/a>\uff08GPU\u670d\u52a1\u5668\u79df\u7528\uff09\u3002\u62b1\u7740\u201c\u771f\u6709\u8fd9\u4e48\u795e\uff1f\u201d\u7684\u7591\u95ee\uff0c\u6211\u4eec\u8fd8\u539f\u4e86\u4ed6\u4eec\u7684\u5b8c\u6574\u6d41\u7a0b\uff0c\u53d1\u73b0\u8fd9\u5957\u201c\u79d1\u7814\u52a0\u901f\u5305\u201d\u786e\u5b9e\u628a\u56db\u5ea7\u5927\u5c71\u53d8\u6210\u4e86\u56db\u4e2a\u201c\u4e00\u952e\u6309\u94ae\u201d\u3002<\/p>\n<hr \/>\n<h2>\u75db\u70b9\u62c6\u89e3\uff1a\u4e3a\u4ec0\u4e48 70B \u5fae\u8c03\u603b\u662f\u201c\u6162+\u8d35+\u8106\u201d\uff1f<\/h2>\n<table>\n<thead>\n<tr>\n<th>\u73af\u8282<\/th>\n<th>\u4f20\u7edf\u505a\u6cd5<\/th>\n<th>\u5e38\u89c1\u7ffb\u8f66\u70b9<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>\u6570\u636e\u6e05\u6d17<\/td>\n<td>\u81ea\u5df1\u5199\u811a\u672c\u53bb\u91cd\u3001\u8fc7\u6ee4\u3001\u5206\u8bcd<\/td>\n<td>\u91cd\u590d\u6587\u672c\u6f0f\u5220\uff0c\u8bad\u7ec3 12 \u5c0f\u65f6\u540e Loss 
\u53cd\u5f39<\/td>\n<\/tr>\n<tr>\n<td>\u5206\u5e03\u5f0f\u8bad\u7ec3<\/td>\n<td>\u624b\u642d Megatron+DeepSpeed<\/td>\n<td>NCCL \u62a5\u9519\u3001\u8282\u70b9\u6389\u7ebf\uff0c\u4e00\u665a\u56de\u5230\u89e3\u653e\u524d<\/td>\n<\/tr>\n<tr>\n<td>\u65ad\u70b9\u7eed\u8bad<\/td>\n<td>\u81ea\u5df1\u5199 Checkpoint \u4e0a\u4f20\u903b\u8f91<\/td>\n<td>\u624b\u6ed1 ^C\uff0c4000 \u5143\u5361\u65f6\u76f4\u63a5\u84b8\u53d1<\/td>\n<\/tr>\n<tr>\n<td>\u7ecf\u8d39\u63a7\u5236<\/td>\n<td>\u5305\u6708 GPU \u4e91\u4e3b\u673a<\/td>\n<td>\u95f2\u65f6\u7a7a\u7f6e\u3001\u5fd9\u65f6\u6392\u961f\uff0c\u9884\u7b97\u82b1\u5f97\u201c\u4e0d\u9971\u548c\u201d<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<hr \/>\n<h2>\u661f\u5b87\u667a\u7b97\u300c\u79d1\u7814\u52a0\u901f\u5305\u300d\uff1a\u628a 30 \u5929\u538b\u7f29\u6210 3 \u5929<\/h2>\n<h3>1. \u6570\u636e\uff1a1 TB \u9ad8\u8d28\u91cf\u4e2d\u6587\u8bed\u6599\u76f4\u63a5\u6302\u8f7d<\/h3>\n<p>\u5e73\u53f0\u516c\u5171\u6570\u636e\u96c6\u6c60\u5df2\u5185\u7f6e<strong>1 TB \u7ecf\u8fc7\u53bb\u91cd\u3001\u654f\u611f\u8fc7\u6ee4\u3001\u8d28\u91cf\u6253\u5206\u7684\u4e2d\u6587\u8bed\u6599<\/strong>\uff0c\u8986\u76d6\u767e\u79d1\u3001\u95ee\u7b54\u3001\u65b0\u95fb\u3001\u79d1\u6280\u8bba\u6587\u3001\u91d1\u878d\u62a5\u544a 5 \u5927\u573a\u666f\u3002\u7528\u6237\u65e0\u9700\u518d\u4ece HuggingFace\u3001\u7f51\u76d8\u201c\u62d6\u201d\u6570\u636e\uff0c<strong>\u4e0d\u5360\u7528\u4e2a\u4eba\u78c1\u76d8\u914d\u989d<\/strong>\uff0c\u76f4\u63a5 <code>ln -s<\/code> \u5230\u8bad\u7ec3\u5bb9\u5668\u5373\u53ef\u8c03\u7528\u3002\u7701\u4e0b\u7684\u4e0d\u4ec5\u662f\u4e0b\u8f7d\u65f6\u95f4\uff0c\u8fd8\u6709<strong>\u6bcf GB 0.15 \u5143\/\u6708\u7684\u5b58\u50a8\u8d39<\/strong>\u3002<\/p>\n<h3>2. 
\u955c\u50cf\uff1aMegatron-LM + DeepSpeed \u53cc\u955c\u50cf\uff0cNCCL \u5df2\u8c03\u901a<\/h3>\n<p>\u5b98\u65b9\u63d0\u4f9b\u4e24\u5957\u5f00\u7bb1\u955c\u50cf\uff1a<br \/>\n&#8211; <code>megatron-ngc:23.06-py3<\/code><br \/>\n&#8211; <code>deepspeed:0.12.1-torch2.1-cuda12.1<\/code><br \/>\n\u5747\u5df2\u9884\u88c5 <code>flash-attn<\/code>\u3001<code>apex<\/code>\u3001<code>ftfy<\/code>\u3001<code>jieba_fast<\/code>\uff0c\u5e76\u5199\u597d 3D \u5e76\u884c\uff08TP=8, PP=4, DP=4\uff09\u793a\u4f8b\u811a\u672c\u3002\u7528\u6237\u53ea\u9700\u628a\u79c1\u6709\u8bed\u6599 <code>scp<\/code> \u5230 <code>\/workspace\/data<\/code>\uff0c\u6267\u884c <code>bash pretrain_megatron.sh<\/code> \u5373\u53ef\u62c9\u8d77\u8bad\u7ec3\uff0c<strong>\u96f6 NCCL \u62a5\u9519<\/strong>\u3002<\/p>\n<h3>3. \u7b97\u529b\uff1a8\u00d7A100 80G NVLink \u8282\u70b9\uff0c\u4f4e\u81f3 2.1 \u5143\/\u5361\/\u65f6<\/h3>\n<p>\u5e73\u53f0\u91c7\u7528<strong>\u6309\u5c0f\u65f6\u8ba1\u8d39<\/strong>\u7684 GPU\u670d\u52a1\u5668\u79df\u7528 \u6a21\u5f0f\uff0c\u652f\u6301 1\uff5e8 \u8282\u70b9\u5f39\u6027\u4f38\u7f29\u3002\u4ee5 70B \u6a21\u578b\u4e3a\u4f8b\uff0cFP16+Zero-3 \u9700\u8981 64 \u5f20 A100\uff0c<strong>\u6ee1\u8d1f\u8377 72 \u5c0f\u65f6<\/strong>\u4ec5\u9700<br \/>\n64 \u00d7 2.1 \u00d7 72 = <strong>9 676.8 \u5143<\/strong><br \/>\n\u76f8\u6bd4\u4f20\u7edf\u5305\u6708\u65b9\u6848\u8282\u7701 42%\uff0c\u4e14<strong>\u968f\u65f6\u53ef\u91ca\u653e<\/strong>\uff0c\u4e0d\u4f1a\u51fa\u73b0\u201c\u673a\u5668\u7a7a\u8dd1\u3001\u94b1\u5305\u6d41\u8840\u201d\u3002<\/p>\n<h3>4. 
\u6d41\u7a0b\uff1a4 \u6b65\u6d41\u6c34\u7ebf\uff0c30 \u5206\u949f\u5b8c\u6210\u4e0a\u4f20\u2192\u8bad\u7ec3<\/h3>\n<pre><code class=\"language-bash\"># \u2460 \u4e0a\u4f20\u79c1\u6709\u8bed\u6599\nscp -r my_corpus\/ root@node0:\/workspace\/data\n\n# \u2461 \u5e73\u53f0\u81ea\u52a8\u53bb\u91cd\npython \/scripts\/auto_dedup.py --input \/workspace\/data --output \/workspace\/data_dedup\n\n# \u2462 \u62c6\u5206\u8bad\u7ec3\/\u9a8c\u8bc1\npython \/scripts\/split_train_val.py --ratio 0.98\n\n# \u2463 \u542f\u52a8 3D \u5e76\u884c\nbash \/examples\/megatron_70b_sft.sh\n<\/code><\/pre>\n<p>\u5168\u7a0b Web \u7aef\u53ef\u89c6\uff0c<strong>TensorBoard \u4e0e WandB \u53cc\u901a\u9053\u5b9e\u65f6\u5237\u65b0<\/strong>\uff0cLoss\u3001Perplexity\u3001GPU \u5229\u7528\u7387\u4e00\u5c4f\u638c\u63e1\u3002<\/p>\n<h3>5. \u65ad\u70b9\u7eed\u8bad\uff1aCheckpoint \u81ea\u52a8\u540c\u6b65\u5230\u5bf9\u8c61\u5b58\u50a8<\/h3>\n<p>\u6bcf 1000 step \u81ea\u52a8\u89e6\u53d1 <code>torch.save()<\/code>\uff0c\u5e76\u5f02\u6b65\u4e0a\u4f20\u81f3\u661f\u5b87\u5bf9\u8c61\u5b58\u50a8\uff08\u517c\u5bb9 S3 \u534f\u8bae\uff09\u3002\u8282\u70b9\u610f\u5916\u6389\u7ebf\uff1f\u65b0\u8282\u70b9\u62c9\u8d77\u540e\u6267\u884c  <\/p>\n<pre><code class=\"language-bash\">aws s3 cp s3:\/\/starverse-checkpoint\/latest.ckpt \/workspace\/checkpoint\/\n<\/code><\/pre>\n<p><strong>10 \u79d2\u6062\u590d\u8bad\u7ec3<\/strong>\uff0c\u4e0d\u518d\u201c\u767d\u7ed9\u201d 4000 \u5143\u3002<\/p>\n<h3>6. 
\u8bba\u6587\u52a9\u653b\uff1a\u4e00\u952e\u5bfc\u51fa\u201c\u53ef\u590d\u73b0\u6750\u6599\u5305\u201d<\/h3>\n<p>\u5b9e\u9a8c\u7ed3\u675f\uff0c\u5e73\u53f0\u81ea\u52a8\u751f\u6210 <code>reproduce.zip<\/code>\uff0c\u5185\u542b\uff1a<br \/>\n&#8211; \u5b8c\u6574\u4ee3\u7801 commit id<br \/>\n&#8211; \u73af\u5883 requirements.txt<br \/>\n&#8211; \u8bad\u7ec3\u65e5\u5fd7\u4e0e TensorBoard \u4e8b\u4ef6\u6587\u4ef6<br \/>\n&#8211; \u6bcf\u4e2a Checkpoint \u7684 md5 \u503c<br \/>\n\u5ba1\u7a3f\u4eba\u8981\u6c42\u201c\u590d\u73b0\u201d\uff1f\u76f4\u63a5\u628a\u538b\u7f29\u5305\u4e0a\u4f20\u5230\u8865\u5145\u6750\u6599\uff0c<strong>\u8282\u7701 2 \u5929\u6574\u7406\u65f6\u95f4<\/strong>\u3002<\/p>\n<hr \/>\n<h2>\u5b9e\u6218\u7ed3\u679c\uff1a70B \u6a21\u578b 3 \u5929\u6536\u655b\uff0cLoss\u21930.82<\/h2>\n<table>\n<thead>\n<tr>\n<th>\u6307\u6807<\/th>\n<th>Day 0<\/th>\n<th>Day 1<\/th>\n<th>Day 2<\/th>\n<th>Day 3<\/th>\n<\/tr>\n<\/thead>\n<tbody>\n<tr>\n<td>Train Loss<\/td>\n<td>2.47<\/td>\n<td>2.01<\/td>\n<td>1.79<\/td>\n<td>1.65<\/td>\n<\/tr>\n<tr>\n<td>C-Eval<\/td>\n<td>38.4<\/td>\n<td>42.1<\/td>\n<td>44.7<\/td>\n<td>46.2<\/td>\n<\/tr>\n<tr>\n<td>GPU \u5229\u7528\u7387<\/td>\n<td>\u2014<\/td>\n<td>97.8 %<\/td>\n<td>98.1 %<\/td>\n<td>97.9 %<\/td>\n<\/tr>\n<\/tbody>\n<\/table>\n<p>\u5168\u7a0b 64 \u5361 A100 \u65e0\u95f4\u65ad\uff0c<strong>\u8bad\u7ec3 72 \u5c0f\u65f6\uff0cCheckpoint 12 \u4e2a<\/strong>\uff0c\u6700\u7ec8\u6a21\u578b\u5df2\u4e0a\u4f20\u81f3 HuggingFace\uff08repo: <code>StarLM-70B-Chinese-SFT<\/code>\uff09\uff0c\u7d2f\u8ba1\u4e0b\u8f7d 3.7 k\u3002<\/p>\n<hr \/>\n<h2>\u65b0\u7528\u6237\u798f\u5229\uff1a10 \u5143\u4f53\u9a8c\u91d1\uff0c0 \u6210\u672c\u8dd1\u901a 7B \u6a21\u578b<\/h2>\n<p>\u5373\u65e5\u8d77\u81f3 6 \u6708 30 \u65e5\uff0c\u65b0\u6ce8\u518c<a href=\"https:\/\/www.starverse-ai.com\">\u661f\u5b87\u667a\u7b97<\/a>\u8d26\u53f7\uff0c<strong>\u81ea\u52a8\u5230\u8d26 10 \u5143\u4f53\u9a8c\u91d1<\/strong>\uff0c\u53ef\u65e0\u635f\u4f53\u9a8c\uff1a<br 
\/>\n&#8211; 1\u00d7RTX 4090 24G \u5361 5 \u5c0f\u65f6<br \/>\n&#8211; \u6216 2\u00d7A100 40G \u5361 1 \u5c0f\u65f6<br \/>\n\u8db3\u591f\u8dd1\u901a 7B \u6a21\u578b\u5fae\u8c03\u5168\u6d41\u7a0b\u3002\u8001\u7528\u6237\u9080\u8bf7\u597d\u53cb\u518d\u5f97 20 \u5143\uff0c<strong>\u4e0a\u4e0d\u5c01\u9876<\/strong>\u3002<\/p>\n<hr \/>\n<h2>\u5199\u5728\u6700\u540e<\/h2>\n<p>\u5f53\u201c\u5927\u6a21\u578b\u201d\u5377\u5230 70B\u3001130B\uff0c\u751a\u81f3 200B\uff0c\u79d1\u7814\u7ade\u4e89\u65e9\u5df2\u4e0d\u53ea\u662f\u7b97\u6cd5\u521b\u65b0\uff0c\u800c\u662f<strong>\u6570\u636e\u3001\u7b97\u529b\u3001\u5de5\u7a0b\u5316<\/strong>\u7684\u7efc\u5408\u6bd4\u62fc\u3002\u661f\u5b87\u667a\u7b97\u7528\u4e00\u7ad9\u5f0f\u201cGPU\u670d\u52a1\u5668\u79df\u7528 + \u6570\u636e\u96c6 + \u955c\u50cf + \u8fd0\u7ef4\u201d\u7ec4\u5408\u62f3\uff0c\u628a 30 \u5929\u7684\u4f53\u529b\u6d3b\u538b\u7f29\u6210 3 \u5929\u7684\u201c\u70b9\u51fb-next\u201d\uff0c\u8ba9\u79d1\u7814\u4eba\u5458\u628a\u6709\u9650\u7ecf\u8d39\u771f\u6b63\u82b1\u5728\u201c\u60f3\u6cd5\u201d\u4e0a\u3002  <\/p>\n<p>\u5982\u679c\u4f60\u4e5f\u5728\u4e3a\u201c\u5927\u6a21\u578b\u5fae\u8c03\u6162\u3001GPU\u4e91\u4e3b\u673a\u8d35\u3001AI\u5e94\u7528\u843d\u5730\u96be\u201d\u800c\u5934\u75bc\uff0c\u4e0d\u59a8\u9886\u53d6 10 \u5143\u4f53\u9a8c\u91d1\uff0c<strong>\u4eca\u5929\u6ce8\u518c\uff0c\u660e\u5929\u51fa\u6a21\u578b<\/strong>\u3002\u661f\u5b87\u667a\u7b97\u5df2\u5907\u597d 64 \u5f20 A100\uff0c\u7b49\u4f60\u628a\u4e0b\u4e00\u4e2a 70B \u4e2d\u6587\u5927\u6a21\u578b\uff0c\u9001\u8fdb C-Eval 
\u6392\u884c\u699c\u524d\u5341\u3002<\/p>\n","protected":false},"excerpt":{"rendered":"<p>\u201c\u5927\u6a21\u578b\u5fae\u8c03\u4e00\u6b21\uff0c\u6570\u636e\u6e05\u6d17\u4e09\u5929\uff0c\u8bad\u7ec3\u4e09\u5929\uff0c\u8c03\u53c2\u4e09\u5929\uff0c\u6700\u540e\u53d1\u73b0&hellip;<\/p>\n","protected":false},"author":2,"featured_media":2063,"comment_status":"","ping_status":"","sticky":false,"template":"","format":"standard","meta":{"footnotes":""},"categories":[1],"tags":[],"class_list":["post-2064","post","type-post","status-publish","format-standard","has-post-thumbnail","hentry","category-zixun"],"views":49,"_links":{"self":[{"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/posts\/2064","targetHints":{"allow":["GET"]}}],"collection":[{"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/posts"}],"about":[{"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/types\/post"}],"author":[{"embeddable":true,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/users\/2"}],"replies":[{"embeddable":true,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/comments?post=2064"}],"version-history":[{"count":0,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/posts\/2064\/revisions"}],"wp:featuredmedia":[{"embeddable":true,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/media\/2063"}],"wp:attachment":[{"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/media?parent=2064"}],"wp:term":[{"taxonomy":"category","embeddable":true,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/categories?post=2064"},{"taxonomy":"post_tag","embeddable":true,"href":"https:\/\/www.starverse-ai.com\/guide\/wp-json\/wp\/v2\/tags?post=2064"}],"curies":[{"name":"wp","href":"https:\/\/api.w.org\/{rel}","templated":true}]}}