# Get the code
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
# Compilation
# NVIDIA GPU on Windows: first install Visual Studio, then the CUDA Toolkit
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j8
# Intel iGPU on Windows: first install Visual Studio, then oneAPI (SYCL backend)
./examples/sycl/build.sh
#
# Usage: CLI
/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-cli -m model.gguf --single-turn -cnv -fa -p "Tell me something about Beijing." -ngl 99
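# Optional: the same CLI call driven from Python via subprocess; a minimal sketch that reuses the binary, model, and flags from the command above (adjust the paths for your setup)
import subprocess

cmd = [
    "/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-cli",
    "-m", "model.gguf", "--single-turn", "-cnv", "-fa",
    "-p", "Tell me something about Beijing.",
    "-ngl", "99",
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)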
# Usage: server
## Linux: start the model service
/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-server -fa -m model.gguf -ngl 99 --ctx-size 8192 --predict 1024 --temp 0.8 --top-k 40 --top-p 0.9 --repeat-penalty 1.1 --rope-freq-base 500000
## Call the API with curl
curl --request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data '{"prompt": "Tell me something about Beijing.","n_predict": 128}'
## Windows
### The iGPU (SYCL) build needs the oneAPI environment loaded first
#### In cmd
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
#### In PowerShell
cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
llama.cpp\build\bin\llama-server.exe -m model.gguf -c 2048
curl --request POST --url http://localhost:8080/completion --header "Content-Type: application/json" --data "{\"prompt\": \"Tell me something about Beijing.\",\"n_predict\": 2048}"
# Python: OpenAI-style client
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1",  # "http://<Your api-server IP>:port"
    api_key="sk-no-key-required",
)
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me something about Beijing."}
    ],
    max_tokens=512,
)
print(completion.choices[0].message)
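# Optional: stream tokens from the same endpoint instead of waiting for the full reply; a minimal sketch that reuses the client defined above
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me something about Beijing."}
    ],
    max_tokens=512,
    stream=True,  # the server sends OpenAI-style chunks
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()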
# Python: call the API by shelling out to curl
import subprocess, json

prompt = "Tell me something about Beijing."
json_data = {"prompt": prompt, "n_predict": 512}
# print(json.dumps(json_data))
cmd_line = [
    "curl", "--request", "POST",
    "--url", "http://localhost:8080/completion",
    "--header", "Content-Type: application/json",
    "--data", json.dumps(json_data),
]
result = subprocess.run(cmd_line, capture_output=True, text=True)
print(result.stdout)
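# Alternative without the curl dependency: call /completion with the Python standard library; a minimal sketch that assumes the server above is listening on localhost:8080
import json
import urllib.request

payload = json.dumps({"prompt": "Tell me something about Beijing.", "n_predict": 512}).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read())
print(body.get("content", body))  # the generated text is in the "content" field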
# HF model to GGUF
## Convert an HF model to an FP16 GGUF model
python llama.cpp-master/convert_hf_to_gguf.py Model_Path
## Offline quantization
llama-quantize model.gguf q4_0
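# Optional: run the convert + quantize pipeline from Python; a minimal sketch where the script path, model directory, and output filenames are placeholders to adjust for your setup
import subprocess

model_dir = "Model_Path"        # HF model directory (placeholder)
fp16_gguf = "model-f16.gguf"    # FP16 GGUF produced by the converter (placeholder)
q4_gguf = "model-q4_0.gguf"     # quantized output (placeholder)

# HF model -> FP16 GGUF
subprocess.run(
    ["python", "llama.cpp-master/convert_hf_to_gguf.py", model_dir, "--outfile", fp16_gguf],
    check=True,
)
# FP16 GGUF -> Q4_0 GGUF
subprocess.run(["llama-quantize", fp16_gguf, q4_gguf, "q4_0"], check=True)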
# Self-trained QAT or PTQ model to GGUF format
# This yields higher performance than the official default quantization method. It is used internally and is not open source for the time being.