# Get the code
git clone https://github.com/ggml-org/llama.cpp
cd llama.cpp
# Compilation
# NVIDIA GPU on Windows: first install Visual Studio, then the CUDA Toolkit
cmake -B build -DGGML_CUDA=ON
cmake --build build --config Release -j8
# Intel iGPU on Windows: first install Visual Studio, then oneAPI (SYCL backend)
./examples/sycl/build.sh
#
# Usage: CLI
/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-cli -m model.gguf --single-turn -cnv -fa -p "Tell me something about Beijing." -ngl 99
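# Optional: the same CLI call driven from Python via subprocess; a minimal sketch that reuses the binary, model, and flags from the command above (adjust the paths for your setup)
import subprocess

cmd = [
    "/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-cli",
    "-m", "model.gguf", "--single-turn", "-cnv", "-fa",
    "-p", "Tell me something about Beijing.",
    "-ngl", "99",
]
result = subprocess.run(cmd, capture_output=True, text=True)
print(result.stdout)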
# Usage: server
## Linux: start the model service
/dfs/data/LlamaCpp/llama.cpp-master/build/bin/llama-server -fa -m model.gguf -ngl 99 --ctx-size 8192 --predict 1024 --temp 0.8 --top-k 40 --top-p 0.9 --repeat-penalty 1.1 --rope-freq-base 500000
## Call the API with curl
curl --request POST \
--url http://localhost:8080/completion \
--header "Content-Type: application/json" \
--data '{"prompt": "Tell me something about Beijing.","n_predict": 128}'
## Windows
### The iGPU (SYCL) build needs the oneAPI environment loaded first
#### In cmd
"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" intel64
#### In PowerShell
cmd.exe "/K" '"C:\Program Files (x86)\Intel\oneAPI\setvars.bat" && powershell'
llama.cpp\build\bin\llama-server.exe -m model.gguf -c 2048
curl --request POST --url http://localhost:8080/completion --header "Content-Type: application/json" --data "{\"prompt\": \"Tell me something about Beijing.\",\"n_predict\": 2048}"
# Python: OpenAI-style client
import openai

client = openai.OpenAI(
    base_url="http://localhost:8080/v1",  # "http://<Your api-server IP>:port"
    api_key="sk-no-key-required",
)
completion = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me something about Beijing."}
    ],
    max_tokens=512,
)
print(completion.choices[0].message)
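# Optional: stream tokens from the same endpoint instead of waiting for the full reply; a minimal sketch that reuses the client defined above
stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Tell me something about Beijing."}
    ],
    max_tokens=512,
    stream=True,  # the server sends OpenAI-style chunks
)
for chunk in stream:
    delta = chunk.choices[0].delta.content
    if delta:
        print(delta, end="", flush=True)
print()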
# Python: call the API by shelling out to curl
import subprocess, json

prompt = "Tell me something about Beijing."
json_data = {"prompt": prompt, "n_predict": 512}
# print(json.dumps(json_data))
cmd_line = [
    "curl", "--request", "POST",
    "--url", "http://localhost:8080/completion",
    "--header", "Content-Type: application/json",
    "--data", json.dumps(json_data),
]
result = subprocess.run(cmd_line, capture_output=True, text=True)
print(result.stdout)
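# Alternative without the curl dependency: call /completion with the Python standard library; a minimal sketch that assumes the server above is listening on localhost:8080
import json
import urllib.request

payload = json.dumps({"prompt": "Tell me something about Beijing.", "n_predict": 512}).encode("utf-8")
req = urllib.request.Request(
    "http://localhost:8080/completion",
    data=payload,
    headers={"Content-Type": "application/json"},
)
with urllib.request.urlopen(req) as resp:
    body = json.loads(resp.read())
print(body.get("content", body))  # the generated text is in the "content" field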
# HF model to GGUF
## Convert an HF model to an FP16 GGUF model
python llama.cpp-master/convert_hf_to_gguf.py Model_Path
## Offline quantization
llama-quantize model.gguf q4_0
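# Optional: run the convert + quantize pipeline from Python; a minimal sketch where the script path, model directory, and output filenames are placeholders to adjust for your setup
import subprocess

model_dir = "Model_Path"        # HF model directory (placeholder)
fp16_gguf = "model-f16.gguf"    # FP16 GGUF produced by the converter (placeholder)
q4_gguf = "model-q4_0.gguf"     # quantized output (placeholder)

# HF model -> FP16 GGUF
subprocess.run(
    ["python", "llama.cpp-master/convert_hf_to_gguf.py", model_dir, "--outfile", fp16_gguf],
    check=True,
)
# FP16 GGUF -> Q4_0 GGUF
subprocess.run(["llama-quantize", fp16_gguf, q4_gguf, "q4_0"], check=True)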
# Self-trained QAT or PTQ model to GGUF format
# This yields higher performance than the official default quantization method. It is used internally and is not open source for the time being.