jupyter notebook
magic commands
%load filename
Loads Python code into the notebook, similar to bash's source. How it behaves: run the cell once and the file's text content is expanded into the current cell; run it again and the expanded code is executed.
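A minimal sketch of the two-step behavior (hello.py is a hypothetical file containing one print statement):
# cell content before the first run
%load hello.py
# cell content after the first run: the magic line is commented out and the file body is expanded
# %load hello.py
print("hello from hello.py")
# the second run then executes the expanded code and prints the message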
install torch
Open the documentation and pick the pip installation method; avoid conda, which tends to get stuck.
https://pytorch.org/get-started/locally/
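For example, the selector produces pip commands of this shape (the cu121 index URL here is only an assumption for a CUDA 12.1 build; copy the exact line the page gives you):
pip3 install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121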
taming-transformers
git clone https://github.com/CompVis/taming-transformers
cd taming-transformers
conda env create -f environment.yaml
conda activate taming
pip install -e .
conda install cudatoolkit
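A quick sanity check that the environment is usable (a sketch; it assumes the editable install exposes the taming package and that a GPU is visible):
python -c "import torch, taming; print(torch.__version__, torch.cuda.is_available())"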
import json
data = {
    "name": "dean",
    "age": 30
}
print(json.dumps(data))
https://anaconda.org/nvidia/cuda-nvcc
conda install nvidia::cuda-toolkit
conda install nvidia/label/cuda-11.3.0::cuda-toolkit
conda install nvidia/label/cuda-11.3.1::cuda-toolkit
conda install nvidia/label/cuda-11.4.0::cuda-toolkit
conda install nvidia/label/cuda-11.4.1::cuda-toolkit
conda install nvidia/label/cuda-11.4.2::cuda-toolkit
conda install nvidia/label/cuda-11.4.3::cuda-toolkit
conda install nvidia/label/cuda-11.4.4::cuda-toolkit
conda install nvidia/label/cuda-11.5.0::cuda-toolkit
conda install nvidia/label/cuda-11.5.1::cuda-toolkit
conda install nvidia/label/cuda-11.5.2::cuda-toolkit
conda install nvidia/label/cuda-11.6.0::cuda-toolkit
conda install nvidia/label/cuda-11.6.1::cuda-toolkit
conda install nvidia/label/cuda-11.6.2::cuda-toolkit
conda install nvidia/label/cuda-11.7.0::cuda-toolkit
conda install nvidia/label/cuda-11.7.1::cuda-toolkit
conda install nvidia/label/cuda-11.8.0::cuda-toolkit
conda install nvidia/label/cuda-12.0.0::cuda-toolkit
conda install nvidia/label/cuda-12.0.1::cuda-toolkit
conda install nvidia/label/cuda-12.1.0::cuda-toolkit
conda install nvidia/label/cuda-12.1.1::cuda-toolkit
conda install nvidia/label/cuda-12.2.0::cuda-toolkit
conda install nvidia/label/cuda-12.2.1::cuda-toolkit
conda install nvidia/label/cuda-12.2.2::cuda-toolkit
conda install nvidia/label/cuda-12.3.0::cuda-toolkit
conda install nvidia/label/cuda-12.3.1::cuda-toolkit
conda install nvidia/label/cuda-12.3.2::cuda-toolkit
conda install nvidia/label/cuda-12.4.0::cuda-toolkit
conda install nvidia/label/cuda-12.4.1::cuda-toolkit
conda install nvidia/label/cuda-12.5.0::cuda-toolkit
conda install nvidia/label/cuda-12.5.1::cuda-toolkit
conda install nvcc
conda install nvidia::cuda-nvcc
conda install nvidia/label/cuda-11.3.0::cuda-nvcc
conda install nvidia/label/cuda-11.3.1::cuda-nvcc
conda install nvidia/label/cuda-11.4.0::cuda-nvcc
conda install nvidia/label/cuda-11.4.1::cuda-nvcc
conda install nvidia/label/cuda-11.4.2::cuda-nvcc
conda install nvidia/label/cuda-11.4.3::cuda-nvcc
conda install nvidia/label/cuda-11.4.4::cuda-nvcc
conda install nvidia/label/cuda-11.5.0::cuda-nvcc
conda install nvidia/label/cuda-11.5.1::cuda-nvcc
conda install nvidia/label/cuda-11.5.2::cuda-nvcc
conda install nvidia/label/cuda-11.6.0::cuda-nvcc
conda install nvidia/label/cuda-11.6.1::cuda-nvcc
conda install nvidia/label/cuda-11.6.2::cuda-nvcc
conda install nvidia/label/cuda-11.7.0::cuda-nvcc
conda install nvidia/label/cuda-11.7.1::cuda-nvcc
conda install nvidia/label/cuda-11.8.0::cuda-nvcc
conda install nvidia/label/cuda-12.0.0::cuda-nvcc
conda install nvidia/label/cuda-12.0.1::cuda-nvcc
conda install nvidia/label/cuda-12.1.0::cuda-nvcc
conda install nvidia/label/cuda-12.1.1::cuda-nvcc
conda install nvidia/label/cuda-12.2.0::cuda-nvcc
conda install nvidia/label/cuda-12.2.1::cuda-nvcc
conda install nvidia/label/cuda-12.2.2::cuda-nvcc
conda install nvidia/label/cuda-12.3.0::cuda-nvcc
conda install nvidia/label/cuda-12.3.1::cuda-nvcc
conda install nvidia/label/cuda-12.3.2::cuda-nvcc
conda install nvidia/label/cuda-12.4.0::cuda-nvcc
conda install nvidia/label/cuda-12.4.1::cuda-nvcc
conda install nvidia/label/cuda-12.5.0::cuda-nvcc
conda install nvidia/label/cuda-12.5.1::cuda-nvcc
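Whichever label you choose, it is worth confirming afterwards that the conda-provided compiler is the one on PATH (the path below is illustrative):
which nvcc
# ~/anaconda3/envs/<your-env>/bin/nvcc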
conda environment
cuda build and execute
Install the CUDA toolkit for compilation.
conda create -n myenv
conda install -c nvidia cuda cuda-nvcc
Example code (test.cu):
#include <stdio.h>

// CUDA kernel to perform vector addition
__global__ void vectorAdd(int* a, int* b, int* c, int size) {
    int tid = blockIdx.x * blockDim.x + threadIdx.x;
    if (tid < size) {
        c[tid] = a[tid] + b[tid];
    }
}

int main() {
    int size = 1024;               // Size of the vectors
    int a[size], b[size], c[size]; // Input and output arrays

    // Initialize input arrays
    for (int i = 0; i < size; i++) {
        a[i] = i;
        b[i] = 2 * i;
    }

    // Declare GPU memory pointers
    int *dev_a, *dev_b, *dev_c;

    // Allocate GPU memory
    cudaMalloc((void**)&dev_a, size * sizeof(int));
    cudaMalloc((void**)&dev_b, size * sizeof(int));
    cudaMalloc((void**)&dev_c, size * sizeof(int));

    // Copy input arrays from host to GPU memory
    cudaMemcpy(dev_a, a, size * sizeof(int), cudaMemcpyHostToDevice);
    cudaMemcpy(dev_b, b, size * sizeof(int), cudaMemcpyHostToDevice);

    // Define grid and block dimensions
    int blockSize = 256;
    int gridSize = (size + blockSize - 1) / blockSize;

    // Launch CUDA kernel on the GPU
    vectorAdd<<<gridSize, blockSize>>>(dev_a, dev_b, dev_c, size);

    // Copy the result back from GPU to host memory
    cudaMemcpy(c, dev_c, size * sizeof(int), cudaMemcpyDeviceToHost);

    // Print the result
    for (int i = 0; i < size; i++) {
        printf("%d + %d = %d\n", a[i], b[i], c[i]);
    }

    // Free GPU memory
    cudaFree(dev_a);
    cudaFree(dev_b);
    cudaFree(dev_c);

    return 0;
}
build and run
nvcc test.cu -o example
./example
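If nvcc picks a default architecture your card cannot run, you can name the target explicitly. sm_61 below is only an assumption matching the GTX 1080 shown in the nvidia-smi output further down; substitute your GPU's compute capability:
nvcc -arch=sm_61 test.cu -o example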
conda install pytorch and pytorch-cuda
nvidia version matching
The NVIDIA kernel driver version and the CUDA toolkit version must roughly match.
To see the driver version and the CUDA version it supports, use nvidia-smi.
command:
nvidia-smi
Tue Apr 30 18:06:30 2024
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.171.04 Driver Version: 535.171.04 CUDA Version: 12.2 |
|-----------------------------------------+----------------------+----------------------+
| GPU Name Persistence-M | Bus-Id Disp.A | Volatile Uncorr. ECC |
| Fan Temp Perf Pwr:Usage/Cap | Memory-Usage | GPU-Util Compute M. |
| | | MIG M. |
|=========================================+======================+======================|
| 0 NVIDIA GeForce GTX 1080 Off | 00000000:01:00.0 On | N/A |
| 32% 45C P8 12W / 180W | 289MiB / 8192MiB | 9% Default |
| | | N/A |
+-----------------------------------------+----------------------+----------------------+
` CUDA Version: 12.2 ` is what we want to know.
To figure out the installed CUDA toolkit version, use nvcc --version.
command:
nvcc --version
nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2024 NVIDIA Corporation
Built on Thu_Mar_28_02:18:24_PDT_2024
Cuda compilation tools, release 12.4, V12.4.131
Build cuda_12.4.r12.4/compiler.34097967_0
` cuda_12.4.r12.4 ` is what we want to know.
install pytorch
With `pytorch` installed, you can run your model on CPU. With `pytorch-cuda` installed, you can run your model on CUDA.
Install pytorch:
conda install pytorch=2.2
or
conda install pytorch==2.2.2
A single equals sign allows an inexact (prefix) match, while a double equals sign requires an exact match.
To test if cuda is available:
python3
>>> import torch
>>> print(torch.cuda.is_available())
False
Install pytorch-cuda:
conda install pytorch-cuda=12.4 -c pytorch
The version should match the CUDA version reported by the nvidia-smi command.
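Once a matching pytorch-cuda is installed, the earlier check should flip to True (a quick sanity sketch, assuming the driver itself is healthy):
python3
>>> import torch
>>> print(torch.cuda.is_available())
True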
Sometimes the version you want does not exist; try other versions, as long as the major version matches:
conda search pytorch-cuda -c pytorch
Loading channels: done
# Name Version Build Channel
pytorch-cuda 11.6 h867d48c_0 pytorch
pytorch-cuda 11.6 h867d48c_1 pytorch
pytorch-cuda 11.7 h67b0de4_0 pytorch
pytorch-cuda 11.7 h67b0de4_1 pytorch
pytorch-cuda 11.7 h67b0de4_2 pytorch
pytorch-cuda 11.7 h778d358_3 pytorch
pytorch-cuda 11.7 h778d358_5 pytorch
pytorch-cuda 11.8 h7e8668a_3 pytorch
pytorch-cuda 11.8 h7e8668a_5 pytorch
pytorch-cuda 11.8 h8dd9ede_2 pytorch
pytorch-cuda 12.1 ha16c6d3_5 pytorch
pytorch-cuda 12.4 hc786d27_6 pytorch
For example, if your CUDA version is 11.3, you may try 11.7, which is backward compatible with 11.3.
Sometimes pytorch-cuda has dependency conflicts with the Python version, so you may need to downgrade to an older Python as well:
conda install python=3.8 pytorch-cuda=12.4 -c pytorch -v
It is better to follow the instructions here: https://pytorch.org/get-started/previous-versions/
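For reference, the commands on that page pin pytorch, torchvision and torchaudio together; the pairing below is my recollection for 2.2.2 and should be double-checked against the page:
conda install pytorch==2.2.2 torchvision==0.17.2 torchaudio==2.2.2 pytorch-cuda=11.8 -c pytorch -c nvidia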
vllm
Install
pip install vllm
Run
from vllm import LLM, SamplingParams
import time
#llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") # Name or path of your model
#help(LLM)
path="./MiniCPM-2B-dpo-bf16"
llm = LLM(model=path, trust_remote_code=True,
max_model_len=4096) # Name or path of your model
help(LLM)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=4096)
start = time.time()
output = llm.generate("详细介绍一下北京", sampling_params)
end = time.time()
duration = end - start
#print(output[0])
out_tokens = len(output[0].outputs[0].token_ids)
print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
start = time.time()
output = llm.generate("详细介绍一下北京", sampling_params)
end = time.time()
duration = end - start
#print(output[0])
out_tokens = len(output[0].outputs[0].token_ids)
print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
Parameter tuning
GPU memory usage:
llm2 = LLM(model="./MiniCPM-2B-128k", trust_remote_code=True,
gpu_memory_utilization=0.2,
max_model_len=8192)
0.2 means at most 20% of the GPU memory is used in total. If not set, vLLM takes all of the GPU memory and uses the headroom to increase the number of batches it can support.
Getting NVIDIA GPU memory usage:
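The pynvml bindings used below ship as a separate package; if the import fails, install it first (package name as published on PyPI):
pip install pynvml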
from pynvml import *
nvmlInit()
h = nvmlDeviceGetHandleByIndex(0)
info = nvmlDeviceGetMemoryInfo(h)
print(f'total : {info.total}')
print(f'free : {info.free}')
print(f'used : {info.used}')
Streaming output
from vllm import LLM, SamplingParams
import time
from vllm import EngineArgs, LLMEngine, RequestOutput, SamplingParams
import asyncio
import json
import ssl
from argparse import Namespace
from typing import Any, AsyncGenerator, Optional
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (random_uuid)
#path="/mnt/bn/znzx-public/models/gemma-2-2b-it"
#llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") # Name or path of your model
#help(LLM)
path="/mnt/bn/znzx-public/models/MiniCPM-2B-dpo-bf16"
#help(LLM)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=4096)
prompt = "详细介绍一下北京"
engine_args = AsyncEngineArgs(model=path, trust_remote_code=True, dtype="float16",
max_model_len=4096)
engine = LLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.API_SERVER)
#help(engine)
#results_generator = iterate_with_cancellation(
# results_generator, is_cancelled=False)
from transformers import AutoTokenizer
from vllm.inputs import (TextPrompt,
TokensPrompt)
tokenizer = AutoTokenizer.from_pretrained(path)
def xx(prompts):
    for prompt in prompts:
        history = []
        history.append({"role": "user", "content": prompt})
        history_str = tokenizer.apply_chat_template(history, tokenize=False, add_generation_prompt=False)
        inputs = tokenizer(history_str, return_tensors='pt')
        ids = inputs["input_ids"].cpu()[0].tolist()
        tp = TokensPrompt(prompt_token_ids=ids)
        print(tp)
        print(history_str)
        start = time.time()
        request_id = random_uuid()
        engine.add_request(str(request_id), tp, sampling_params)
    count = 0
    while engine.has_unfinished_requests():
        responses = engine.step()
        count += 1
        if count > 5:
            break
        print(responses)
        for resp in responses:
            print(f"req_id:{resp.request_id}, text:{resp.outputs[0].text}, len:{len(resp.outputs[0].token_ids)}")
        print("----------------")
#async for token in results_generator:
# count+=1
# end = time.time()
# if count == 1 or token.finished :
# data = {
# "text": token.outputs[0].text,
# "token_ids":token.outputs[0].token_ids,
# }
# print(f'{data["text"][0:5]}, token_num:{len(data["token_ids"])}, time: {end-start}', end='', flush=True)
# yield data
# Using the sync generator in a for loop
#for item in sync_generator(xx(prompt)):
# print(item["text"], len(item["token_ids"]))
xx([prompt, "你好呀"])
#llm = LLM(model=path, trust_remote_code=True, dtype="float16",
# max_model_len=4096) # Name or path of your model
#start = time.time()
#output = llm.generate("详细介绍一下北京", sampling_params)
#end = time.time()
#duration = end - start
#print(output[0])
#out_tokens = len(output[0].outputs[0].token_ids)
#print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
#start = time.time()
#output = llm.generate("详细介绍一下北京", sampling_params)
#end = time.time()
#duration = end - start
#print(output[0])
#out_tokens = len(output[0].outputs[0].token_ids)
#print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
from vllm import LLM, SamplingParams
import time
import asyncio
import json
import ssl
from argparse import Namespace
from typing import Any, AsyncGenerator, Optional
from fastapi import FastAPI, Request
from fastapi.responses import JSONResponse, Response, StreamingResponse
from vllm.engine.arg_utils import AsyncEngineArgs
from vllm.engine.async_llm_engine import AsyncLLMEngine
from vllm.entrypoints.launcher import serve_http
from vllm.logger import init_logger
from vllm.sampling_params import SamplingParams
from vllm.usage.usage_lib import UsageContext
from vllm.utils import (random_uuid)
#path="/mnt/bn/znzx-public/models/gemma-2-2b-it"
#llm = LLM(model="Qwen/Qwen2-1.5B-Instruct") # Name or path of your model
#help(LLM)
path="/mnt/bn/znzx-public/models/MiniCPM-2B-dpo-bf16"
#help(LLM)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=4096)
request_id = random_uuid()
prompt = "详细介绍一下北京"
engine_args = AsyncEngineArgs(model=path, trust_remote_code=True, dtype="float16",
max_model_len=4096)
engine = AsyncLLMEngine.from_engine_args(
engine_args, usage_context=UsageContext.API_SERVER)
#results_generator = iterate_with_cancellation(
# results_generator, is_cancelled=False)
async def xx(prompt):
    start = time.time()
    results_generator = engine.generate(prompt, sampling_params, request_id)
    count = 0
    async for token in results_generator:
        count += 1
        end = time.time()
        if count == 1 or token.finished:
            data = {
                "text": token.outputs[0].text,
                "token_ids": token.outputs[0].token_ids,
            }
            print(f"{data}, {end-start}", end='', flush=True)
            yield data

def sync_generator_from_async(aiter):
    loop = asyncio.get_event_loop()
    async def inner():
        async for item in aiter:
            yield item
    return inner()

# xx is an async generator, so it cannot be handed to asyncio.run directly;
# drive it with a small coroutine that consumes the stream.
async def consume():
    async for _ in xx(prompt):
        pass

asyncio.run(consume())
#llm = LLM(model=path, trust_remote_code=True, dtype="float16",
# max_model_len=4096) # Name or path of your model
#start = time.time()
#output = llm.generate("详细介绍一下北京", sampling_params)
#end = time.time()
#duration = end - start
#print(output[0])
#out_tokens = len(output[0].outputs[0].token_ids)
#print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
#start = time.time()
#output = llm.generate("详细介绍一下北京", sampling_params)
#end = time.time()
#duration = end - start
#print(output[0])
#out_tokens = len(output[0].outputs[0].token_ids)
#print(f"output_tokens:{out_tokens}, time:{duration}, output speed: {out_tokens/duration}")
other tricks
Using only the tokenizer, without PyTorch installed
pip3 install transformers
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen1.5-7B-Chat")
model_inputs = tokenizer(["hello, I am dean"])
print(model_inputs)
Output:
{'input_ids': [[14990, 11, 358, 1079, 72862]], 'attention_mask': [[1, 1, 1, 1, 1]]}
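To sanity-check the ids, you can decode them back; a small round-trip sketch (detokenized spacing may differ slightly):
print(tokenizer.decode(model_inputs["input_ids"][0]))
# hello, I am dean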
nvidia-cuda-pytorch troubleshooting
Traceback (most recent call last):
File "./qwen_run_rich_text_pipeline.py", line 156, in <module>
extracted_formats = model_chat(extract_format_model, format_inputs, extract_format_system_prompt)
File "./qwen_run_rich_text_pipeline.py", line 72, in model_chat
input_ids.to(device),
File "/root/anaconda3/envs/qwen/lib/python3.8/site-packages/torch/cuda/__init__.py", line 293, in _lazy_init
torch._C._cuda_init()
RuntimeError: Unexpected error from cudaGetDeviceCount(). Did you run some cuda functions before calling NumCudaDevices() that might have already set an error? Error 803: system has unsupported display driver / cuda driver combination
Solution:
echo $CUDA_HOME
# /usr/local/cuda
which nvcc
# /root/anaconda3/envs/qwen/bin/nvcc
export CUDA_HOME="/root/anaconda3/envs/qwen/"
export LD_LIBRARY_PATH=/opt/tiger/native_libhdfs/lib/native:/opt/tiger/jdk/jdk8u265-b01/jre/lib/amd64/server:/opt/tiger/yarn_deploy/hadoop/lib/native:/opt/tiger/yarn_deploy/hadoop/lib/native/ufs:/opt/tiger/native_libhdfs/lib/native:/opt/tiger/jdk/jdk8u265-b01/jre/lib/amd64/server:/opt/tiger/yarn_deploy/hadoop/lib/native:/opt/tiger/yarn_deploy/hadoop/lib/native/ufs:/opt/tiger/yarn_deploy/hadoop/lib/native:/opt/tiger/yarn_deploy/hadoop_current/lib/native:/opt/tiger/yarn_deploy/hadoop_current/lzo/lib:/root/anaconda3/envs/qwen//lib
LD_LIBRARY_PATH matters a great deal, and the order of the directories inside it matters too. This is especially true when CUDA runtimes have been installed in more than one way on your system, e.g. some installed system-wide on Linux and some installed with conda inside a virtual env. In that case it is best to let only one of them take effect.
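After fixing CUDA_HOME and LD_LIBRARY_PATH, a quick way to confirm that torch now sees a consistent runtime (just a sanity check):
python3 -c "import torch; print(torch.version.cuda, torch.cuda.is_available())"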