Skip to content

Commit 0eb313d

Browse files
committed
Merge branch 'develop'
2 parents 6ec130a + 02e824a commit 0eb313d

File tree

15 files changed

+860
-74
lines changed

15 files changed

+860
-74
lines changed

.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,6 @@
11
.idea
2+
cmake-build-*
3+
**/*.cbp
24
**/CMakeCache.txt
35
**/CMakeFiles
46
**/.DS_Store

.travis.linux

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,9 @@
11
#!/bin/sh
22

3+
rm -rf /opt/python
34
sudo add-apt-repository -y ppa:ubuntu-toolchain-r/test
45
wget http://developer.download.nvidia.com/compute/cuda/repos/ubuntu1404/x86_64/cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
56
sudo dpkg -i cuda-repo-ubuntu1404_8.0.44-1_amd64.deb
67
sudo apt-get update
7-
sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
8+
sudo apt-get install -y --no-install-suggests --no-install-recommends g++-5 python3-dev python3-numpy r-base-core cuda-cudart-dev-8-0 cuda-curand-dev-8-0 cuda-core-8-0 cuda-misc-headers-8-0
89
sudo update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-5 1 --slave /usr/bin/g++ g++ /usr/bin/g++-5

.travis.osx

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,8 @@
11
#!/bin/sh
22

33
brew install llvm --with-clang
4-
brew install python3
4+
brew tap homebrew/science
5+
brew install python3 r
56
pip3 install numpy
67
brew cask update
78
brew cask install --verbose cuda

README.md

Lines changed: 147 additions & 23 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,8 @@ ball tree.
1818

1919
Technically, this project is a library which exports the two functions
2020
defined in `kmcuda.h`: `kmeans_cuda` and `knn_cuda`.
21-
It has the built-in Python3 native extension support, so you can
22-
`from libKMCUDA import kmeans_cuda`.
21+
It has the built-in Python3 and R native extension support, so you can
22+
`from libKMCUDA import kmeans_cuda` or `dyn.load("libKMCUDA.so")`.
2323

2424
[![source{d}](img/sourced.png)](http://sourced.tech)
2525
<p align="right"><a href="img/kmeans_image.ipynb">How was this created?</a></p>
@@ -33,16 +33,23 @@ Table of contents
3333
* [macOS](#macos)
3434
* [Testing](#testing)
3535
* [Benchmarks](#benchmarks)
36-
* [100000x256@1024](#100000x2561024)
36+
* [100,000x256@1024](#100000x2561024)
3737
* [Configuration](#configuration)
3838
* [Contestants](#contestants)
3939
* [Data](#data)
4040
* [Notes](#notes-1)
41+
* [8,000,000x256@1024](#8000000x2561024)
42+
* [Data](#data-1)
43+
* [Notes](#notes-2)
4144
* [Python examples](#python-examples)
4245
* [K-means, L2 (Euclidean) distance](#k-means-l2-euclidean-distance)
43-
* [K-means, angular (cosine) distance average](#k-means-angular-cosine-distance--average)
46+
* [K-means, angular (cosine) distance + average](#k-means-angular-cosine-distance--average)
4447
* [K-nn](#k-nn-1)
4548
* [Python API](#python-api)
49+
* [R examples](#r-examples)
50+
* [K-means](#k-means-1)
51+
* [K-nn](#k-nn-2)
52+
* [R API](#r-api)
4653
* [C examples](#c-examples)
4754
* [C API](#c-api)
4855
* [License](#license)
@@ -123,6 +130,7 @@ It requires cudart 8.0 / Pascal and OpenMP 4.0 capable compiler. The build has
123130
been tested primarily on Linux but it works on macOS too with some bells and whistles
124131
(see "macOS" subsection).
125132
If you do not want to build the Python native module, add `-D DISABLE_PYTHON=y`.
133+
If you do not want to build the R native module, add `-D DISABLE_R=y`.
126134
If CUDA is not automatically found, add `-D CUDA_TOOLKIT_ROOT_DIR=/usr/local/cuda-8.0`
127135
(change the path to the actual one). By default, CUDA kernels are compiled for
128136
the architecture 60 (Pascal). It is possible to override it via `-D CUDA_ARCH=52`,
@@ -167,8 +175,6 @@ Benchmarks
167175
----------
168176

169177
### 100000x256@1024
170-
Comparison of some KMeans implementations:
171-
172178
| | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda | kmcuda 2 GPU |
173179
|------------|----------------|-----------|------------------|--------|--------|--------------|
174180
| time, s | 164 | 36 | 20 | 10.6 | 9.2 | 5.5 |
@@ -193,6 +199,21 @@ Comparison of some KMeans implementations:
193199
#### Notes
194200
100000 is the maximum size Serban KMeans can handle.
195201

202+
### 8000000x256@1024
203+
| | sklearn KMeans | KMeansRex | KMeansRex OpenMP | Serban | kmcuda 2 GPU | kmcuda Yinyang 2 GPU |
204+
|------------|----------------|-----------|------------------|--------|--------------|----------------------|
205+
| time | please no | - | 6h 34m | fail | 44m | 36m |
206+
| memory, GB | - | - | 205 | fail | 8.7 | 10.4 |
207+
208+
kmeans++ initialization, 93 iterations (1% reassignments equivalent).
209+
210+
#### Data
211+
8,000,000 secret production samples.
212+
213+
#### Notes
214+
KMeansRex did eat 205 GB of RAM at peak; it uses dynamic memory so it constantly
215+
bounced from 100 GB to 200 GB.
216+
196217
Python examples
197218
---------------
198219

@@ -276,7 +297,7 @@ calculated 0.276552 of all the distances
276297
Python API
277298
----------
278299
```python
279-
def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
300+
def kmeans_cuda(samples, clusters, tolerance=0.01, init="k-means++",
280301
yinyang_t=0.1, metric="L2", average_distance=False,
281302
seed=time(), device=0, verbosity=0)
282303
```
@@ -289,18 +310,20 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
289310

290311
**clusters** integer, the number of clusters.
291312

292-
**tolerance** float, if the relative number of reassignments drops below this value, stop.
313+
**tolerance** float, if the relative number of reassignments drops below this value,
314+
algorithm stops.
293315

294316
**init** string or numpy array, sets the method for centroids initialization,
295-
may be "k=means++"/"kmeans++", "random" or numpy array of shape
317+
may be "k-means++", "afk-mc2", "random" or numpy array of shape
296318
\[**clusters**, number of features\]. dtype must be float32.
297319

298320
**yinyang_t** float, the relative number of cluster groups, usually 0.1.
321+
0 disables Yinyang refinement.
299322

300323
**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
301-
can be changed to "cos" to behave as Spherical K-means with the
302-
angular distance. Please note that samples *must* be normalized in that
303-
case.
324+
it can be changed to "cos" to change the algorithm to Spherical K-means
325+
with the angular distance. Please note that samples *must* be normalized
326+
in the latter case.
304327

305328
**average_distance** boolean, the value indicating whether to calculate
306329
the average distance between cluster elements and
@@ -309,17 +332,18 @@ def kmeans_cuda(samples, clusters, tolerance=0.0, init="k-means++",
309332

310333
**seed** integer, random generator seed for reproducible results.
311334

312-
**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device, 2 means second device,
313-
3 means using first and second device. Special value 0 enables all available devices.
314-
The default is 0.
335+
**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
336+
2 means second device, 3 means using first and second device. Special
337+
value 0 enables all available devices. The default is 0.
315338

316339
**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
317340
2 means lots of output.
318341

319-
**return** tuple(centroids, assignments). If **samples** was a numpy array or
320-
a host pointer tuple, the types are numpy arrays, otherwise, raw pointers
321-
(integers) allocated on the same device. If **samples** are float16,
322-
the returned centroids are float16 too.
342+
**return** tuple(centroids, assignments, \[average_distance\]).
343+
If **samples** was a numpy array or a host pointer tuple, the types
344+
are numpy arrays, otherwise, raw pointers (integers) allocated on the
345+
same device. If **samples** are float16, the returned centroids are
346+
float16 too.
323347

324348
```python
325349
def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
@@ -342,6 +366,108 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
342366
to be compatible with uint32. If **samples** is a tuple then
343367
**assignments** is a pointer. The shape is (number of samples,).
344368

369+
**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
370+
it can be changed to "cos" to change the algorithm to Spherical K-means
371+
with the angular distance. Please note that samples *must* be normalized
372+
in the latter case.
373+
374+
**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
375+
2 means second device, 3 means using first and second device. Special
376+
value 0 enables all available devices. The default is 0.
377+
378+
**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
379+
2 means lots of output.
380+
381+
**return** neighbor indices. If **samples** was a numpy array or
382+
a host pointer tuple, the return type is numpy array, otherwise, a
383+
raw pointer (integer) allocated on the same device. The shape is
384+
(number of samples, k).
385+
386+
R examples
387+
----------
388+
#### K-means
389+
```R
390+
dyn.load("libKMCUDA.so")
391+
samples = replicate(4, runif(16000))
392+
result = .External("kmeans_cuda", samples, 50, tolerance=0.01,
393+
seed=777, verbosity=1, average_distance=TRUE)
394+
print(result$average_distance)
395+
print(result$centroids[1:10,])
396+
print(result$assignments[1:10])
397+
```
398+
399+
#### K-nn
400+
```R
401+
dyn.load("libKMCUDA.so")
402+
samples = replicate(4, runif(16000))
403+
cls = .External("kmeans_cuda", samples, 50, tolerance=0.01,
404+
seed=777, verbosity=1)
405+
result = .External("knn_cuda", 20, samples, cls$centroids, cls$assignments,
406+
verbosity=1)
407+
print(result[1:10,])
408+
```
409+
410+
R API
411+
-----
412+
```R
413+
function kmeans_cuda(
414+
samples, clusters, tolerance=0.01, init="k-means++", yinyang_t=0.1,
415+
metric="L2", average_distance=FALSE, seed=Sys.time(), device=0, verbosity=0)
416+
```
417+
**samples** real matrix of shape \[number of samples, number of features\]
418+
or list of real matrices which are rbind()-ed internally. No more
419+
than INT32_MAX samples and UINT16_MAX features are supported.
420+
421+
**clusters** integer, the number of clusters.
422+
423+
**tolerance** real, if the relative number of reassignments drops below this value,
424+
algorithm stops.
425+
426+
**init** character vector or real matrix, sets the method for centroids initialization,
427+
may be "k-means++", "afk-mc2", "random" or a real matrix of shape
428+
\[**clusters**, number of features\].
429+
430+
**yinyang_t** real, the relative number of cluster groups, usually 0.1.
431+
0 disables Yinyang refinement.
432+
433+
**metric** character vector, the name of the distance metric to use. The default
434+
is Euclidean (L2), it can be changed to "cos" to change the algorithm
435+
to Spherical K-means with the angular distance. Please note that
436+
samples *must* be normalized in the latter case.
437+
438+
**average_distance** logical, the value indicating whether to calculate
439+
the average distance between cluster elements and
440+
the corresponding centroids. Useful for finding
441+
the best K. Returned as the third list element.
442+
443+
**seed** integer, random generator seed for reproducible results.
444+
445+
**device** integer, bitwise OR-ed CUDA device indices, e.g. 1 means first device,
446+
2 means second device, 3 means using first and second device. Special
447+
value 0 enables all available devices. The default is 0.
448+
449+
**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
450+
2 means lots of output.
451+
452+
**return** list(centroids, assignments\[, average_distance\]). Indices in
453+
assignments start from 1.
454+
455+
```R
456+
function knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosity=0)
457+
```
458+
**k** integer, the number of neighbors to search for each sample. Must be ≤ 2<sup>16</sup>.
459+
460+
**samples** real matrix of shape \[number of samples, number of features\]
461+
or list of real matrices which are rbind()-ed internally.
462+
In the latter case, it is possible to pass in more than INT32_MAX
463+
samples.
464+
465+
**centroids** real matrix with precalculated clusters' centroids (e.g., using
466+
kmeans() or kmeans_cuda()).
467+
468+
**assignments** integer vector with sample-cluster associations. Indices start
469+
from 1.
470+
345471
**metric** str, the name of the distance metric to use. The default is Euclidean (L2),
346472
can be changed to "cos" to behave as Spherical K-means with the
347473
angular distance. Please note that samples *must* be normalized in that
@@ -354,10 +480,8 @@ def knn_cuda(k, samples, centroids, assignments, metric="L2", device=0, verbosit
354480
**verbosity** integer, 0 means complete silence, 1 means mere progress logging,
355481
2 means lots of output.
356482

357-
**return** neighbor indices. If **samples** was a numpy array or
358-
a host pointer tuple, the return type is numpy array, otherwise, a
359-
raw pointer (integer) allocated on the same device. The shape is
360-
(number of samples, k).
483+
**return** integer matrix with neighbor indices. The shape is (number of samples, k).
484+
Indices start from 1.
361485

362486
C examples
363487
----------

src/CMakeLists.txt

Lines changed: 13 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,6 @@
11
cmake_minimum_required(VERSION 3.2)
22
project(KMCUDA)
3+
set(CMAKE_MODULE_PATH ${CMAKE_HOME_DIRECTORY}/cmake)
34
find_package(OpenMP REQUIRED)
45
if (APPLE AND NOT CUDA_HOST_COMPILER)
56
# https://gitlab.kitware.com/cmake/cmake/issues/13674
@@ -24,7 +25,9 @@ if (NOT DISABLE_PYTHON)
2425
execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import numpy; print(numpy.get_include())" OUTPUT_VARIABLE NUMPY_INCLUDES)
2526
endif()
2627
endif()
27-
28+
if (NOT DISABLE_R)
29+
find_package(R)
30+
endif()
2831
if (PROFILE OR CMAKE_BUILD_TYPE STREQUAL "Debug")
2932
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DPROFILE")
3033
endif()
@@ -35,9 +38,12 @@ endif()
3538
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -march=native -Wall -Werror -DCUDA_ARCH=${CUDA_ARCH} -std=c++11 ${OpenMP_CXX_FLAGS}")
3639
set(SOURCE_FILES kmcuda.cc kmcuda.h wrappers.h private.h fp_abstraction.h tricks.cuh
3740
metric_abstraction.h kmeans.cu knn.cu transpose.cu)
38-
if (NOT DISABLE_PYTHON)
41+
if (PYTHONLIBS_FOUND)
3942
list(APPEND SOURCE_FILES python.cc)
4043
endif()
44+
if (R_FOUND)
45+
list(APPEND SOURCE_FILES r.cc)
46+
endif()
4147
if (CMAKE_BUILD_TYPE STREQUAL "Debug")
4248
set(NVCC_FLAGS "-G -g")
4349
endif()
@@ -59,10 +65,14 @@ if (APPLE)
5965
set(CMAKE_SHARED_LIBRARY_CXX_FLAGS "${CMAKE_SHARED_LIBRARY_CXX_FLAGS_BACKUP}")
6066
endif()
6167
target_link_libraries(KMCUDA ${CUDA_curand_LIBRARY})
62-
if(PYTHONLIBS_FOUND)
68+
if (PYTHONLIBS_FOUND)
6369
include_directories(${PYTHON_INCLUDE_DIRS} ${NUMPY_INCLUDES})
6470
target_link_libraries(KMCUDA ${PYTHON_LIBRARIES})
6571
endif()
72+
if (R_FOUND)
73+
include_directories(${R_INCLUDE_DIRS})
74+
target_link_libraries(KMCUDA ${R_LIBRARIES})
75+
endif()
6676
if (SUFFIX)
6777
set_target_properties(KMCUDA PROPERTIES SUFFIX ${SUFFIX})
6878
endif()

src/cmake/FindR.cmake

Lines changed: 55 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,55 @@
1+
# CMake module to find R
2+
# - Try to find R
3+
# Once done, this will define
4+
#
5+
# R_FOUND - system has R
6+
# R_INCLUDE_DIRS - the R include directories
7+
# R_LIBRARIES - link these to use R
8+
# R_ROOT_DIR - As reported by R
9+
# Author: Omar Andres Zapata Mesa 31/05/2013
10+
if(${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
11+
set(CMAKE_FIND_APPBUNDLE "LAST")
12+
endif()
13+
find_program(R_EXECUTABLE NAMES R R.exe)
14+
#---searching R installation using R executable
15+
if(R_EXECUTABLE)
16+
execute_process(COMMAND ${R_EXECUTABLE} RHOME
17+
OUTPUT_VARIABLE R_ROOT_DIR
18+
OUTPUT_STRIP_TRAILING_WHITESPACE)
19+
find_path(R_INCLUDE_DIR R.h
20+
HINTS ${R_ROOT_DIR}
21+
PATHS /usr/local/lib /usr/local/lib64 /usr/share
22+
PATH_SUFFIXES include R/include
23+
DOC "Path to file R.h")
24+
find_library(R_LIBRARY R
25+
HINTS ${R_ROOT_DIR}/lib
26+
DOC "R library (example libR.a, libR.dylib, etc.).")
27+
endif()
28+
#---setting include dirs and libraries
29+
set(R_LIBRARIES ${R_LIBRARY})
30+
set(R_INCLUDE_DIRS ${R_INCLUDE_DIR})
31+
foreach(_cpt ${R_FIND_COMPONENTS})
32+
execute_process(COMMAND echo "cat(find.package('${_cpt}'))"
33+
COMMAND ${R_EXECUTABLE} --vanilla --slave
34+
OUTPUT_VARIABLE _cpt_path
35+
OUTPUT_STRIP_TRAILING_WHITESPACE)
36+
find_library(R_${_cpt}_LIBRARY
37+
lib${_cpt}.so lib${_cpt}.dylib
38+
HINTS ${_cpt_path}/lib)
39+
if(R_${_cpt}_LIBRARY)
40+
mark_as_advanced(R_${_cpt}_LIBRARY)
41+
list(APPEND R_LIBRARIES ${R_${_cpt}_LIBRARY})
42+
endif()
43+
find_path(R_${_cpt}_INCLUDE_DIR ${_cpt}.h HINTS ${_cpt_path} PATH_SUFFIXES include R/include)
44+
if(R_${_cpt}_INCLUDE_DIR)
45+
mark_as_advanced(R_${_cpt}_INCLUDE_DIR)
46+
list(APPEND R_INCLUDE_DIRS ${R_${_cpt}_INCLUDE_DIR})
47+
endif()
48+
if(R_${_cpt}_INCLUDE_DIR AND R_${_cpt}_LIBRARY)
49+
list(REMOVE_ITEM R_FIND_COMPONENTS ${_cpt})
50+
endif()
51+
endforeach()
52+
# Handle the QUIETLY and REQUIRED arguments and set R_FOUND to TRUE if all listed variables are TRUE
53+
include(FindPackageHandleStandardArgs)
54+
find_package_handle_standard_args(R DEFAULT_MSG R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)
55+
mark_as_advanced(R_FOUND R_EXECUTABLE R_INCLUDE_DIR R_LIBRARY)

0 commit comments

Comments
 (0)