Skip to content

Commit 036f682

Browse files
committed
Merge branch 'master' into xsn/server_mtmd
2 parents 574d403 + 27aa259 commit 036f682

File tree

12 files changed

+275
-154
lines changed

12 files changed

+275
-154
lines changed

.github/workflows/build.yml

+26-45
Original file line numberDiff line numberDiff line change
@@ -771,7 +771,7 @@ jobs:
771771
uses: hendrikmuhs/ccache-action@v1.2.16
772772
with:
773773
key: windows-msys2
774-
variant: sccache
774+
variant: ccache
775775
evict-old-files: 1d
776776

777777
- name: Setup ${{ matrix.sys }}
@@ -814,26 +814,18 @@ jobs:
814814
strategy:
815815
matrix:
816816
include:
817-
- build: 'noavx-x64'
818-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX=OFF -DGGML_AVX2=OFF -DGGML_FMA=OFF'
819-
- build: 'avx2-x64'
820-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON'
821-
- build: 'avx-x64'
822-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX2=OFF'
823-
- build: 'avx512-x64'
824-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_AVX512=ON'
817+
- build: 'cpu-x64'
818+
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF'
825819
- build: 'openblas-x64'
826-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
827-
- build: 'kompute-x64'
828-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
820+
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_BLAS=ON -DGGML_BLAS_VENDOR=OpenBLAS -DBLAS_INCLUDE_DIRS="$env:RUNNER_TEMP/openblas/include" -DBLAS_LIBRARIES="$env:RUNNER_TEMP/openblas/lib/openblas.lib"'
829821
- build: 'vulkan-x64'
830-
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_VULKAN=ON'
822+
defines: '-DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_VULKAN=ON'
831823
- build: 'llvm-arm64'
832824
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
833-
- build: 'msvc-arm64'
834-
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-msvc.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON'
835825
- build: 'llvm-arm64-opencl-adreno'
836826
defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/arm64-windows-llvm.cmake -DCMAKE_PREFIX_PATH="$env:RUNNER_TEMP/opencl-arm64-release" -DGGML_OPENCL=ON -DGGML_OPENCL_USE_ADRENO_KERNELS=ON'
827+
# - build: 'kompute-x64'
828+
# defines: '-G "Ninja Multi-Config" -D CMAKE_TOOLCHAIN_FILE=cmake/x64-windows-llvm.cmake -DGGML_NATIVE=OFF -DLLAMA_BUILD_SERVER=ON -DGGML_RPC=ON -DGGML_BACKEND_DL=ON -DGGML_CPU_ALL_VARIANTS=ON -DGGML_OPENMP=OFF -DGGML_KOMPUTE=ON -DKOMPUTE_OPT_DISABLE_VULKAN_VERSION_CHECK=ON'
837829

838830
steps:
839831
- name: Clone
@@ -846,7 +838,7 @@ jobs:
846838
uses: hendrikmuhs/ccache-action@v1.2.16
847839
with:
848840
key: windows-latest-cmake-${{ matrix.build }}
849-
variant: sccache
841+
variant: ccache
850842
evict-old-files: 1d
851843

852844
- name: Clone Kompute submodule
@@ -922,39 +914,26 @@ jobs:
922914
cp $env:RUNNER_TEMP/openblas/bin/libopenblas.dll ./build/bin/Release/openblas.dll
923915
cp $env:RUNNER_TEMP/OpenBLAS.LICENSE.txt ./build/bin/Release/OpenBLAS-${env:OPENBLAS_VERSION}.txt
924916
925-
- name: Check AVX512F support
926-
id: check_avx512f
927-
if: ${{ matrix.build == 'avx512-x64' }}
928-
continue-on-error: true
929-
run: |
930-
cd build
931-
$vcdir = $(vswhere -latest -products * -requires Microsoft.VisualStudio.Component.VC.Tools.x86.x64 -property installationPath)
932-
$msvc = $(join-path $vcdir $('VC\Tools\MSVC\'+$(gc -raw $(join-path $vcdir 'VC\Auxiliary\Build\Microsoft.VCToolsVersion.default.txt')).Trim()))
933-
$cl = $(join-path $msvc 'bin\Hostx64\x64\cl.exe')
934-
echo 'int main(void){unsigned int a[4];__cpuid(a,7);return !(a[1]&65536);}' >> avx512f.c
935-
& $cl /O2 /GS- /kernel avx512f.c /link /nodefaultlib /entry:main
936-
.\avx512f.exe && echo "AVX512F: YES" && ( echo HAS_AVX512F=1 >> $env:GITHUB_ENV ) || echo "AVX512F: NO"
937-
938917
- name: Test
939918
id: cmake_test
940-
# not all machines have native AVX-512
941-
if: ${{ matrix.build != 'msvc-arm64' && matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' && matrix.build != 'kompute-x64' && matrix.build != 'vulkan-x64' && (matrix.build != 'avx512-x64' || env.HAS_AVX512F == '1') }}
919+
if: ${{ matrix.build != 'llvm-arm64' && matrix.build != 'llvm-arm64-opencl-adreno' }}
942920
run: |
943921
cd build
944922
ctest -L main -C Release --verbose --timeout 900
945923
946-
- name: Test (Intel SDE)
947-
id: cmake_test_sde
948-
if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
949-
run: |
950-
curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
951-
# for some weird reason windows tar doesn't like sde tar.xz
952-
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
953-
7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
954-
$sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
955-
cd build
956-
$env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
957-
& $sde -future -- ctest -L main -C Release --verbose --timeout 900
924+
# TODO: disabled for now, consider adding tests for all CPU variants instead
925+
# - name: Test (Intel SDE)
926+
# id: cmake_test_sde
927+
# if: ${{ matrix.build == 'avx512-x64' && env.HAS_AVX512F == '0' }} # use Intel SDE for AVX-512 emulation
928+
# run: |
929+
# curl.exe -o $env:RUNNER_TEMP/sde.tar.xz -L "https://downloadmirror.intel.com/813591/sde-external-${env:SDE_VERSION}-win.tar.xz"
930+
# # for some weird reason windows tar doesn't like sde tar.xz
931+
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar.xz
932+
# 7z x "-o${env:RUNNER_TEMP}" $env:RUNNER_TEMP/sde.tar
933+
# $sde = $(join-path $env:RUNNER_TEMP sde-external-${env:SDE_VERSION}-win/sde.exe)
934+
# cd build
935+
# $env:LLAMA_SKIP_TESTS_SLOW_ON_EMULATOR = 1
936+
# & $sde -future -- ctest -L main -C Release --verbose --timeout 900
958937

959938
- name: Determine tag name
960939
id: tag
@@ -1039,7 +1018,7 @@ jobs:
10391018
uses: hendrikmuhs/ccache-action@v1.2.16
10401019
with:
10411020
key: ${{ github.job }}-${{ matrix.cuda }}-${{ matrix.build }}
1042-
variant: sccache
1021+
variant: ccache
10431022
evict-old-files: 1d
10441023

10451024
- name: Install Cuda Toolkit 11.7
@@ -1117,6 +1096,8 @@ jobs:
11171096
cmake -S . -B build -G "Ninja Multi-Config" ^
11181097
-DLLAMA_BUILD_SERVER=ON ^
11191098
-DGGML_NATIVE=OFF ^
1099+
-DGGML_BACKEND_DL=ON ^
1100+
-DGGML_CPU_ALL_VARIANTS=ON ^
11201101
-DGGML_CUDA=ON ^
11211102
-DGGML_RPC=ON ^
11221103
-DCURL_LIBRARY="%CURL_PATH%/lib/libcurl.dll.a" -DCURL_INCLUDE_DIR="%CURL_PATH%/include"
@@ -1191,7 +1172,7 @@ jobs:
11911172
uses: hendrikmuhs/ccache-action@v1.2.16
11921173
with:
11931174
key: windows-latest-cmake-sycl
1194-
variant: sccache
1175+
variant: ccache
11951176
evict-old-files: 1d
11961177

11971178
- name: Install

cmake/arm64-windows-msvc.cmake

-6
This file was deleted.

cmake/x64-windows-llvm.cmake

-6
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,3 @@ set( CMAKE_SYSTEM_PROCESSOR x86_64 )
33

44
set( CMAKE_C_COMPILER clang )
55
set( CMAKE_CXX_COMPILER clang++ )
6-
7-
set( arch_c_flags "-march=native" )
8-
9-
set( CMAKE_C_FLAGS_INIT "${arch_c_flags}" )
10-
set( CMAKE_CXX_FLAGS_INIT "${arch_c_flags}" )
11-

ggml/src/ggml-cpu/ggml-cpu-quants.c

+111
Original file line numberDiff line numberDiff line change
@@ -6596,7 +6596,118 @@ void ggml_vec_dot_q3_K_q8_K(int n, float * GGML_RESTRICT s, size_t bs, const voi
65966596
}
65976597

65986598
*s = hsum_float_8(acc);
6599+
#elif defined(__VXE__) || defined(__VXE2__)
6600+
uint32_t aux[3];
6601+
uint32_t utmp[4];
6602+
6603+
const int32x4_t v_z = vec_splat_s32(0);
6604+
const uint8x16_t v_3m = vec_splat_u8(0x03);
6605+
6606+
const uint8x16_t v_0c = vec_splat_u8(1);
6607+
const uint8x16_t v_1c = vec_sl(v_0c, 1);
6608+
const uint8x16_t v_2c = vec_sl(v_0c, 2);
6609+
const uint8x16_t v_3c = vec_sl(v_0c, 3);
6610+
6611+
uint8x16_t q3h[4];
6612+
uint8x16_t q3b[2];
6613+
int8x16_t q3bytes[4];
6614+
int8x16_t q8bytes[8]; // each inner iteration stores eight 16-byte loads (offsets 0..112 of y0) at indices 0..7
6615+
uint8x16_t qhbits[2];
6616+
6617+
float sum = 0;
6618+
6619+
for (int i = 0; i < nb; ++i) {
6620+
const float d = y[i].d * GGML_FP16_TO_FP32(x[i].d);
65996621

6622+
const uint8_t * restrict x0l = x[i].qs;
6623+
const uint8_t * restrict x0h = x[i].hmask;
6624+
const int8_t * restrict y0 = y[i].qs;
6625+
6626+
qhbits[0] = vec_xl(0 , x0h);
6627+
qhbits[1] = vec_xl(16, x0h);
6628+
6629+
int32_t isum = 0;
6630+
6631+
memcpy(aux, x[i].scales, 12);
6632+
utmp[3] = ((aux[1] >> 4) & kmask2) | (((aux[2] >> 6) & kmask1) << 4);
6633+
utmp[2] = ((aux[0] >> 4) & kmask2) | (((aux[2] >> 4) & kmask1) << 4);
6634+
utmp[1] = (aux[1] & kmask2) | (((aux[2] >> 2) & kmask1) << 4);
6635+
utmp[0] = (aux[0] & kmask2) | (((aux[2] >> 0) & kmask1) << 4);
6636+
6637+
int8_t * scale = (int8_t *)utmp;
6638+
for (int j = 0; j < 16; ++j) scale[j] -= 32;
6639+
6640+
for (int j = 0; j < QK_K/128; ++j) {
6641+
int32x4_t isum0, isum1, isum2, isum3;
6642+
6643+
q3b[0] = vec_xl(0 , x0l);
6644+
q3b[1] = vec_xl(16, x0l);
6645+
x0l += 32;
6646+
6647+
q8bytes[0] = vec_xl(0 , y0);
6648+
q8bytes[1] = vec_xl(16 , y0);
6649+
q8bytes[2] = vec_xl(32 , y0);
6650+
q8bytes[3] = vec_xl(48 , y0);
6651+
q8bytes[4] = vec_xl(64 , y0);
6652+
q8bytes[5] = vec_xl(80 , y0);
6653+
q8bytes[6] = vec_xl(96 , y0);
6654+
q8bytes[7] = vec_xl(112, y0);
6655+
y0 += 128;
6656+
6657+
q3h[0] = vec_sl(vec_andc(v_0c, qhbits[0]), 2);
6658+
q3h[1] = vec_sl(vec_andc(v_0c, qhbits[1]), 2);
6659+
q3h[2] = vec_sl(vec_andc(v_1c, qhbits[0]), 1);
6660+
q3h[3] = vec_sl(vec_andc(v_1c, qhbits[1]), 1);
6661+
6662+
q3bytes[0] = vec_sub((int8x16_t)vec_and(q3b[0], v_3m), (int8x16_t)q3h[0]);
6663+
q3bytes[1] = vec_sub((int8x16_t)vec_and(q3b[1], v_3m), (int8x16_t)q3h[1]);
6664+
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 2), v_3m), (int8x16_t)q3h[2]);
6665+
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 2), v_3m), (int8x16_t)q3h[3]);
6666+
6667+
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[0]);
6668+
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[1]);
6669+
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[2]);
6670+
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[3]);
6671+
6672+
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
6673+
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
6674+
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
6675+
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
6676+
6677+
scale += 4;
6678+
6679+
q3h[0] = vec_andc(v_2c, qhbits[0]);
6680+
q3h[1] = vec_andc(v_2c, qhbits[1]);
6681+
q3h[2] = vec_sr(vec_andc(v_3c, qhbits[0]), 1);
6682+
q3h[3] = vec_sr(vec_andc(v_3c, qhbits[1]), 1);
6683+
6684+
q3bytes[0] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 4), v_3m), (int8x16_t)q3h[0]);
6685+
q3bytes[1] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 4), v_3m), (int8x16_t)q3h[1]);
6686+
q3bytes[2] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[0], 6), v_3m), (int8x16_t)q3h[2]);
6687+
q3bytes[3] = vec_sub((int8x16_t)vec_and(vec_sr(q3b[1], 6), v_3m), (int8x16_t)q3h[3]);
6688+
6689+
isum0 = ggml_vec_dot(v_z, q3bytes[0], q8bytes[4]);
6690+
isum1 = ggml_vec_dot(v_z, q3bytes[1], q8bytes[5]);
6691+
isum2 = ggml_vec_dot(v_z, q3bytes[2], q8bytes[6]);
6692+
isum3 = ggml_vec_dot(v_z, q3bytes[3], q8bytes[7]);
6693+
6694+
isum += (isum0[0] + isum0[1] + isum0[2] + isum0[3]) * scale[0];
6695+
isum += (isum1[0] + isum1[1] + isum1[2] + isum1[3]) * scale[1];
6696+
isum += (isum2[0] + isum2[1] + isum2[2] + isum2[3]) * scale[2];
6697+
isum += (isum3[0] + isum3[1] + isum3[2] + isum3[3]) * scale[3];
6698+
6699+
scale += 4;
6700+
6701+
if (j == 0) {
6702+
qhbits[0] = vec_sr(qhbits[0], 4);
6703+
qhbits[1] = vec_sr(qhbits[1], 4);
6704+
}
6705+
}
6706+
6707+
sum += d * isum;
6708+
}
6709+
6710+
*s = sum;
66006711
#else
66016712
// scalar version
66026713
// This function is written like this so the compiler can manage to vectorize most of it

ggml/src/ggml-cpu/ggml-cpu.cpp

+29-16
Original file line numberDiff line numberDiff line change
@@ -11,24 +11,26 @@
1111
#include <vector>
1212

1313
#ifdef GGML_USE_CPU_HBM
14-
#include "ggml-cpu-hbm.h"
14+
# include "ggml-cpu-hbm.h"
1515
#endif
1616

1717
#ifdef GGML_USE_CPU_KLEIDIAI
18-
#include "kleidiai/kleidiai.h"
19-
#endif
20-
21-
#if defined(__APPLE__)
22-
#include <sys/types.h>
23-
#include <sys/sysctl.h>
18+
# include "kleidiai/kleidiai.h"
2419
#endif
2520

2621
#if defined(_WIN32)
27-
#define WIN32_LEAN_AND_MEAN
28-
#ifndef NOMINMAX
29-
#define NOMINMAX
22+
# define WIN32_LEAN_AND_MEAN
23+
# ifndef NOMINMAX
24+
# define NOMINMAX
25+
# endif
26+
# include <windows.h>
27+
#else
28+
# include <unistd.h>
3029
#endif
31-
#include <windows.h>
30+
31+
#if defined(__APPLE__)
32+
# include <sys/sysctl.h>
33+
# include <sys/types.h>
3234
#endif
3335

3436
// ggml-backend interface
@@ -70,8 +72,10 @@ static ggml_backend_buffer_type_t * ggml_backend_cpu_device_get_extra_buffers_ty
7072
}
7173

7274
// Returns true when `buft` is pointer-identical to one of the non-null entries
// yielded by ggml_backend_cpu_get_extra_buffers_type(), false otherwise.
static bool ggml_backend_cpu_is_extra_buffer_type(ggml_backend_buffer_type_t buft) {
    for (auto * candidate : ggml_backend_cpu_get_extra_buffers_type()) {
        // null entries never match, even when buft itself is null
        if (candidate == nullptr || candidate != buft) {
            continue;
        }
        return true;
    }
    return false;
}
@@ -330,9 +334,18 @@ static const char * ggml_backend_cpu_device_get_description(ggml_backend_dev_t d
330334
}
331335

332336
// Reports physical memory figures for the CPU device.
//
//   dev   - unused
//   free  - out: on Windows, bytes of physical memory currently available;
//           on other platforms, the same value as *total (only the installed
//           amount is queried there)
//   total - out: bytes of installed physical memory
//
// Both outputs are set to 0 when the platform query fails.
static void ggml_backend_cpu_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
    *free  = 0;
    *total = 0;

#ifdef _WIN32
    MEMORYSTATUSEX status;
    status.dwLength = sizeof(status);
    // GlobalMemoryStatusEx returns zero on failure; do not read the struct in that case
    if (GlobalMemoryStatusEx(&status)) {
        *total = (size_t) status.ullTotalPhys;
        *free  = (size_t) status.ullAvailPhys;
    }
#else
    const long pages     = sysconf(_SC_PHYS_PAGES);
    const long page_size = sysconf(_SC_PAGE_SIZE);
    // sysconf returns -1 on error; also multiply as size_t so a 32-bit long
    // cannot overflow (signed overflow is undefined behaviour)
    if (pages > 0 && page_size > 0) {
        const size_t n_pages  = (size_t) pages;
        const size_t n_bytes  = (size_t) page_size;
        const size_t max_size = (size_t) -1;
        // saturate instead of wrapping when the product does not fit in size_t
        *total = n_pages > max_size / n_bytes ? max_size : n_pages * n_bytes;
        *free  = *total;
    }
#endif

    GGML_UNUSED(dev);
}

0 commit comments

Comments
 (0)