renaming
5440
5/data science/1/Custom+CUDA+Kernels+in+Python+with+Numba.ipynb
Normal file
2073
5/data science/1/Effective+Memory+Use.ipynb
Normal file
2071
5/data science/1/Introduction+to+CUDA+Python+with+Numba.ipynb
Normal file
0
5/data science/1e/.Rhistory
Normal file
29
5/data science/1e/Makefile
Normal file
@ -0,0 +1,29 @@
|
||||
# Build an nvcc-based app from main.cu + op.ptx into dist/app.
# DEBUG=false (default) builds optimized; DEBUG=true builds -g -G.
CC = nvcc -arch=sm_75
DEBUG ?= false
DIRS = dist build

ifeq ($(DEBUG), false)
CC += -O3
else
CC += -g -G
endif

# FIX: "clean" was missing from .PHONY -- a file named "clean" would have
# silently disabled the target.
.PHONY: all run clean

all: $(DIRS) dist/app

dist/app: build/main.o build/op.o
	$(CC) $^ -o $@ -lcuda

build/op.o: op.ptx
	$(CC) $^ -dc -o $@

build/main.o: main.cu
	$(CC) $^ -ptx -o build/main.ptx
	$(CC) $^ -rdc=true -dc -o $@

$(DIRS):
	mkdir -p $@

clean:
	rm -rf $(DIRS)
|
||||
23
5/data science/1e/README.md
Normal file
@ -0,0 +1,23 @@
|
||||
[euclidean](https://web.archive.org/web/20230212044931/http://www-math.ucdenver.edu/~wcherowi/courses/m5410/exeucalg.html)
|
||||
[ecdsa1](https://sefiks.com/2018/02/16/elegant-signatures-with-elliptic-curve-cryptography/)
|
||||
[ecdsa2](https://learnmeabitcoin.com/technical/cryptography/elliptic-curve/ecdsa/)
|
||||
[ptx](https://philipfabianek.com/posts/cuda-ptx-introduction)
|
||||
|
||||
высокий приоритет
|
||||
|
||||
6, 7 State Spaces / Properties of State Spaces Ключевое отличие от CPU! В CPU память в основном плоская (RAM, кэш). В GPU есть много типов памяти: глобальная (.global), общая для блока потоков (.shared), константная (.const), локальная (.local) и т.д. Это фундамент для написания производительного кода.
|
||||
19 Cost Estimates for Accessing State-Spaces Прямое продолжение предыдущего пункта. Объясняет, какая память быстрая, а какая медленная. Критично для оптимизации.
|
||||
4 Operator Precedence Синтаксис PTX похож на ассемблер, но с выражениями. Знать приоритет операторов необходимо.
|
||||
8 Fundamental Type Specifiers Типы данных в PTX (.b8, .s16, .f32, .b64 и т.д.). Аналог byte, word, dword в x86, но с учетом специфики GPU.
|
||||
3 Predefined Identifiers Предопределенные константы, такие как %tid, %ctaid, %ntid. Это основа модели выполнения CUDA! Вместо одного потока (RIP/EIP) у вас есть идентификаторы потока, блока и сетки.
|
||||
20 Operation Types Классификация инструкций PTX. Поможет быстро ориентироваться в мануале.
|
||||
1 PTX Directives Директивы ассемблера (.version, .target, .global). Аналог секций и директив в NASM (SECTION .text, global _start)
|
||||
|
||||
средний приоритет
|
||||
|
||||
21 Scopes Области видимости для атомарных операций и барьеров (.cta, .cluster, .gpu, .sys). Важно для синхронизации.
|
||||
14, 40, 56 Различные таблицы про Swizzling и Layout Касаются продвинутых техник работы с памятью и матрицами для оптимизации доступа. Актуально для low-level оптимизаций, похоже на работу с выравниванием и SIMD в x86.
|
||||
29 Summary of Floating-Point Instructions Обзор инструкций для чисел с плавающей точкой. На GPU они крайне важны.
|
||||
30-32 Cache Operators / Eviction Priority Hints Управление кэшем. Продвинутая тема для тонкой настройки, аналогичная prefetch-инструкциям в x86.
|
||||
53, 55, 56 Таблицы про MMA (Matrix Multiply-Accumulate) Инструкции для тензорных ядер (аналог FMA в x86, но для матриц). Сердце производительности в AI/HPC.
|
||||
22-25 Comparison Operators Особенности сравнений для целых и вещественных чисел (учет NaN).
|
||||
161
5/data science/1e/main.cu
Normal file
@ -0,0 +1,161 @@
|
||||
#include <stdint.h>
|
||||
|
||||
// Shared-memory tiled matrix multiply: C = A x B.
// A is N x M, B is M x K, C is N x K, all row-major. Launch with
// TILE_SIZE x TILE_SIZE blocks on a grid covering K (x) by N (y) outputs.
//
// Fixes vs. the original:
//  * no early return before __syncthreads(): exiting the kernel while other
//    threads of the block still reach the barrier is undefined behavior at
//    partial edge tiles, so out-of-range threads stay in the loop and only
//    skip the final store;
//  * row is bounded by N (C has N rows), not M (the original guard was only
//    correct because N == M in this program);
//  * the B tile load is predicated instead of the branchless mask trick,
//    which performed an out-of-bounds global read whenever bRow >= M;
//  * a second __syncthreads() after the partial-product loop prevents the
//    next iteration from overwriting tiles other threads are still reading.
template <typename T, int TILE_SIZE>
__global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
    __shared__ T sA[TILE_SIZE][TILE_SIZE];
    __shared__ T sB[TILE_SIZE][TILE_SIZE];

    int bx = blockIdx.x, by = blockIdx.y;
    int tx = threadIdx.x, ty = threadIdx.y;

    int row = by * TILE_SIZE + ty;
    int col = bx * TILE_SIZE + tx;

    T sum = 0;

    // Number of TILE_SIZE-wide strips along the shared dimension M (ceil-div).
    int tiles_len = (M + TILE_SIZE - 1) / TILE_SIZE;

    for (int tile = 0; tile < tiles_len; tile++) {
        int aCol = tile * TILE_SIZE + tx;
        int bRow = tile * TILE_SIZE + ty;

        // Predicated loads: zero-fill out-of-range elements so edge tiles
        // contribute nothing to the dot product.
        sA[ty][tx] = (row < N && aCol < M) ? A[row * M + aCol] : T(0);
        sB[ty][tx] = (bRow < M && col < K) ? B[bRow * K + col] : T(0);
        __syncthreads();

        for (int k = 0; k < TILE_SIZE; k++) {
            sum += sA[ty][k] * sB[k][tx];
        }
        __syncthreads();  // tiles must not be overwritten while still in use
    }

    if (row < N && col < K) {
        C[row * K + col] = sum;
    }
}
|
||||
|
||||
// Naive (uncached) matrix multiply: C = A x B.
// A is N x M, B is M x K, C is N x K, all row-major. One thread computes one
// element of C; expects a 2-D launch covering K (x) by N (y).
template <typename T>
__global__ void dumb_mat_mul(T *A, T *B, T *C, int N, int M, int K) {
    int col = blockIdx.x * blockDim.x + threadIdx.x;
    int row = blockIdx.y * blockDim.y + threadIdx.y;

    // FIX: rows of C run up to N, not M (the original guard used M, which is
    // only correct when N == M).
    if (col >= K || row >= N) return;

    T sum = 0;
    for (int i = 0; i < M; i++) {
        sum += A[row * M + i] * B[i * K + col];
    }
    C[row * K + col] = sum;
}
|
||||
|
||||
#define N 1024
|
||||
#define M 1024
|
||||
#define K 1024
|
||||
#define NO_PRINT 1
|
||||
#define GRID_DIM 1
|
||||
#define BLOCK_DIM 32
|
||||
|
||||
#define MAT_TYPE int
|
||||
#define MAT_FMT "%d\t"
|
||||
#define A_LEN (N * M)
|
||||
#define B_LEN (M * K)
|
||||
#define C_LEN (N * K)
|
||||
#define A_SIZE (sizeof(MAT_TYPE) * N * M)
|
||||
#define B_SIZE (sizeof(MAT_TYPE) * M * K)
|
||||
#define C_SIZE (sizeof(MAT_TYPE) * N * K)
|
||||
|
||||
#include <cstdio>
|
||||
#include <random>
|
||||
#include <chrono>
|
||||
using namespace std::chrono;
|
||||
|
||||
// Print an n x m row-major matrix, one row per line, formatting each
// element with the caller-supplied printf format string.
template <typename T>
void mat_print(T *a, const char *fmt, int n, int m) {
    for (auto r = 0; r < n; r++) {
        for (auto c = 0; c < m; c++) {
            printf(fmt, a[r * m + c]);
        }
        printf("\n");
    }
}
|
||||
|
||||
// Benchmarks the tiled (mat_mul) and naive (dumb_mat_mul) kernels on random
// N x M and M x K integer matrices, reporting average kernel time over at
// least one second of accumulated runs for each.
int main() {
    std::random_device rd;
    std::mt19937 engine(rd());
    std::uniform_int_distribution<MAT_TYPE> dist(1, 10);

    // Single host allocation holding [ A | B | C ] back to back.
    auto buf = (MAT_TYPE *)malloc(A_SIZE + B_SIZE + C_SIZE);
    for (auto i = 0; i < A_LEN + B_LEN; i++) {
        buf[i] = dist(engine);
    }

    MAT_TYPE *a = buf;
    MAT_TYPE *b = a + A_LEN;
    MAT_TYPE *c = b + B_LEN;

#if NO_PRINT==0
    printf("\na\n");
    mat_print(a, MAT_FMT, N, M);
    printf("\nb\n");
    mat_print(b, MAT_FMT, M, K);
#endif

    MAT_TYPE *d_a, *d_b, *d_c;
    cudaMalloc(&d_a, A_SIZE);
    cudaMalloc(&d_b, B_SIZE);
    cudaMalloc(&d_c, C_SIZE);

    cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice);
    cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice);

    // FIX: the grid must cover the entire N x K output. The original fixed
    // GRID_DIM x GRID_DIM (1 x 1) grid computed only a single 32 x 32 tile
    // of the 1024 x 1024 result, so both timings and results were wrong.
    dim3 gridDim((K + BLOCK_DIM - 1) / BLOCK_DIM, (N + BLOCK_DIM - 1) / BLOCK_DIM);
    dim3 blockDim(BLOCK_DIM, BLOCK_DIM);

    int cycles = 0;
    microseconds duration(0);

    // Repeat until at least one second of accumulated kernel time.
    while (duration.count() < 1e6) {
        auto start = high_resolution_clock::now();
        mat_mul<MAT_TYPE, BLOCK_DIM><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
        cudaDeviceSynchronize();
        auto end = high_resolution_clock::now();

        cycles++;
        duration += duration_cast<microseconds>(end - start);
    }

#if NO_PRINT==0
    cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
    printf("\nc\n");
    mat_print(c, MAT_FMT, N, K);
#endif
    printf("optimized mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);

    cycles = 0;
    duration = microseconds(0);
    while (duration.count() < 1e6) {
        auto start = high_resolution_clock::now();
        dumb_mat_mul<MAT_TYPE><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
        cudaDeviceSynchronize();
        auto end = high_resolution_clock::now();

        cycles++;
        duration += duration_cast<microseconds>(end - start);
    }

#if NO_PRINT==0
    cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
    printf("\nc\n");
    mat_print(c, MAT_FMT, N, K);
#endif
    printf("dumb mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);

    // FIX: the original passed the HOST pointers a/b/c to cudaFree; the
    // device buffers are d_a/d_b/d_c, and the host memory is freed as one
    // block via buf.
    cudaFree(d_a);
    cudaFree(d_b);
    cudaFree(d_c);
    free(buf);
}
|
||||
27
5/data science/1e/main.py
Normal file
@ -0,0 +1,27 @@
|
||||
import sys, time, math
|
||||
import numpy as np
|
||||
import cupy as cp
|
||||
|
||||
def measure(a, b):
    """Average seconds per `a @ b` over at least one second of repeated runs.

    Works for both numpy and cupy arrays; the null-stream synchronize makes
    GPU timings honest and is a no-op-cost call for CPU arrays.
    """
    total = 0
    runs = 0
    while total < 1:
        t0 = time.perf_counter()
        c = a @ b
        cp.cuda.Stream.null.synchronize()
        t1 = time.perf_counter()

        total += t1 - t0
        runs += 1

    return total / runs
|
||||
|
||||
# Compare n x n float32 matmul throughput: numpy (CPU) vs cupy (GPU).
n = 1024

for label in ('numpy', 'cupy'):
    if label == 'numpy':
        x = np.random.rand(n, n).astype(np.float32)
        y = np.random.rand(n, n).astype(np.float32)
    else:
        x = cp.random.rand(n, n, dtype = cp.float32)
        y = cp.random.rand(n, n, dtype = cp.float32)
    print(label, 'take', measure(x, y) * 1e6, 'usec')
|
||||
171
5/data science/1e/op.ptx
Normal file
@ -0,0 +1,171 @@
|
||||
.version 8.4
|
||||
.target sm_75
|
||||
.address_size 64
|
||||
|
||||
// add_u16: 128-bit addition, *out_c = in_a + in_b (mod 2^128).
// Limb convention (established by the tests in secp256k1.cu): the SECOND
// register of each v2 load/store is the LOW 64-bit limb (ulonglong2 .y),
// the first is the HIGH limb (.x). The final carry out of 2^128 is dropped.
.visible .func add_u16(
    .param .b64 out_c,                 // pointer to the 128-bit result
    .param .align 16 .b8 in_a[16],     // 128-bit addend
    .param .align 16 .b8 in_b[16]      // 128-bit addend
) {
    .reg .u64 %ra<2>, %rb<2>;          // %r?0 = low limb, %r?1 = high limb
    .reg .b64 %rdc;                    // destination pointer

    ld.param.b64 %rdc, [out_c];

    // Vector loads fill registers from the lower address up:
    // %ra1 <- a.x (high), %ra0 <- a.y (low).
    ld.param.v2.u64 {%ra1, %ra0}, [in_a];
    ld.param.v2.u64 {%rb1, %rb0}, [in_b];

    // Low-limb add writes the carry into CC.CF; addc consumes it.
    add.cc.u64 %ra0, %ra0, %rb0;
    addc.u64 %ra1, %ra1, %rb1;

    st.v2.u64 [%rdc], {%ra1, %ra0};

    ret;
}
|
||||
|
||||
// sub_u16: 128-bit subtraction, *out_c = in_a - in_b (mod 2^128).
// Same limb convention as add_u16: second register of each v2 access is the
// low limb. Underflow wraps (borrow out of the high limb is dropped).
.visible .func sub_u16(
    .param .b64 out_c,                 // pointer to the 128-bit result
    .param .align 16 .b8 in_a[16],     // minuend
    .param .align 16 .b8 in_b[16]      // subtrahend
) {
    .reg .u64 %ra<2>, %rb<2>;          // %r?0 = low limb, %r?1 = high limb
    .reg .b64 %rdc;                    // destination pointer

    ld.param.b64 %rdc, [out_c];

    // %ra1 <- a.x (high), %ra0 <- a.y (low).
    ld.param.v2.u64 {%ra1, %ra0}, [in_a];
    ld.param.v2.u64 {%rb1, %rb0}, [in_b];

    // Low-limb subtract writes the borrow into CC.CF; subc consumes it.
    sub.cc.u64 %ra0, %ra0, %rb0;
    subc.u64 %ra1, %ra1, %rb1;

    st.v2.u64 [%rdc], {%ra1, %ra0};

    ret;
}
|
||||
|
||||
// add_u32: 256-bit addition, *out_c = in_a + in_b (mod 2^256).
// Limb order matches the ulonglong4 convention used by the tests in
// secp256k1.cu: .x is the most significant limb, .w the least significant.
.visible .func add_u32(
    .param .b64 out_c,                 // pointer to the 256-bit result
    .param .align 16 .b8 in_a[32],
    .param .align 16 .b8 in_b[32]
) {
    .reg .u64 %ra<4>, %rb<4>;          // %r?0 = lowest limb ... %r?3 = highest
    .reg .b64 %rdc;

    ld.param.b64 %rdc, [out_c];

    // First 16 bytes hold the two high limbs (.x, .y); the next 16 bytes
    // hold the two low limbs (.z, .w).
    ld.param.v2.u64 {%ra3, %ra2}, [in_a];
    ld.param.v2.u64 {%ra1, %ra0}, [in_a + 16];
    ld.param.v2.u64 {%rb3, %rb2}, [in_b];
    ld.param.v2.u64 {%rb1, %rb0}, [in_b + 16];

    // Ripple-carry chain from the lowest limb upward.
    add.cc.u64 %ra0, %ra0, %rb0;
    addc.cc.u64 %ra1, %ra1, %rb1;
    addc.cc.u64 %ra2, %ra2, %rb2;
    addc.u64 %ra3, %ra3, %rb3;         // final carry out of 2^256 is dropped

    st.v2.u64 [%rdc], {%ra3, %ra2};
    st.v2.u64 [%rdc + 16], {%ra1, %ra0};

    ret;
}
|
||||
|
||||
// sub_u32: 256-bit subtraction, *out_c = in_a - in_b (mod 2^256).
// Same layout as add_u32: .x most significant limb, .w least significant;
// the low pair of limbs lives at byte offset 16. Underflow wraps.
.visible .func sub_u32(
    .param .b64 out_c,                 // pointer to the 256-bit result
    .param .align 16 .b8 in_a[32],     // minuend
    .param .align 16 .b8 in_b[32]      // subtrahend
) {
    .reg .u64 %ra<4>, %rb<4>;          // %r?0 = lowest limb ... %r?3 = highest
    .reg .b64 %rdc;

    ld.param.b64 %rdc, [out_c];

    ld.param.v2.u64 {%ra3, %ra2}, [in_a];
    ld.param.v2.u64 {%ra1, %ra0}, [in_a + 16];
    ld.param.v2.u64 {%rb3, %rb2}, [in_b];
    ld.param.v2.u64 {%rb1, %rb0}, [in_b + 16];

    // Ripple-borrow chain from the lowest limb upward.
    sub.cc.u64 %ra0, %ra0, %rb0;
    subc.cc.u64 %ra1, %ra1, %rb1;
    subc.cc.u64 %ra2, %ra2, %rb2;
    subc.u64 %ra3, %ra3, %rb3;         // final borrow out of 2^256 is dropped

    st.v2.u64 [%rdc], {%ra3, %ra2};
    st.v2.u64 [%rdc + 16], {%ra1, %ra0};

    ret;
}
|
||||
|
||||
// mul_lo_u16: low 128 bits of the 128x128-bit product,
// *out_c = in_a * in_b (mod 2^128).
// With a = hi(in_a), b = lo(in_a), c = hi(in_b), d = lo(in_b), the result is
//   low limb  = lo(b*d)
//   high limb = lo(a*d + b*c) + hi(b*d)
// The cross term uses the Karatsuba-style identity
//   lo(ad + bc) = lo((a+b)(c+d)) - lo(ac) - lo(bd),
// where dropped carries are harmless because every step is taken mod 2^64.
.visible .func mul_lo_u16(
    .param .b64 out_c,
    .param .align 16 .b8 in_a[16],
    .param .align 16 .b8 in_b[16]
) {
    .reg .u64 %a, %b, %c, %d, %a_b, %c_d;
    .reg .u64 %ac, %bd_hi, %bd_lo, %p;
    .reg .b64 %rdc;

    ld.param.b64 %rdc, [out_c];

    // %a/%c receive the high limbs, %b/%d the low limbs.
    ld.param.v2.u64 {%a, %b}, [in_a];
    ld.param.v2.u64 {%c, %d}, [in_b];

    mul.lo.u64 %ac, %a, %c;        // lo(a*c)
    mul.lo.u64 %bd_lo, %b, %d;     // low limb of the result
    mul.hi.u64 %bd_hi, %b, %d;     // carries of b*d into the high limb

    add.u64 %a_b, %a, %b;          // (a+b) mod 2^64
    add.u64 %c_d, %c, %d;          // (c+d) mod 2^64

    mul.lo.u64 %p, %a_b, %c_d;     // lo((a+b)(c+d))

    sub.u64 %p, %p, %ac;
    sub.u64 %p, %p, %bd_lo;        // %p = lo(a*d + b*c)

    add.u64 %p, %p, %bd_hi;        // high limb = lo(ad+bc) + hi(bd)

    st.v2.u64 [%rdc], {%p, %bd_lo};

    ret;
}
|
||||
|
||||
// mul_u16: intended full 128x128 -> 256-bit product, split into
// *out_c_hi (high 128 bits) and *out_c_lo (low 128 bits).
//
// NOTE(review): this routine looks incomplete / work-in-progress:
//   * %a_b_hi and %c_d_hi are computed as addc(%a, %b) / addc(%c, %d),
//     i.e. a+b+carry rather than just the carry bit of the half sums;
//   * the recombination of the partial products is missing (see the blank
//     region below): %ac_hi/%ac_lo, %bd_hi, and %p_hi are computed but
//     never folded into the result;
//   * the raw intermediate %a_b_lo is stored straight into the high half.
// The reference vectors in secp256k1.cu's test kernel (derived in test.py)
// are consistent with a correct full multiply, so this function currently
// fails them -- confirm and finish before relying on it.
.visible .func mul_u16(
    .param .b64 out_c_hi,              // pointer to the high 128 bits
    .param .b64 out_c_lo,              // pointer to the low 128 bits
    .param .align 16 .b8 in_a[16],
    .param .align 16 .b8 in_b[16]
) {
    .reg .u64 %a, %b, %c, %d;
    .reg .u64 %a_b_hi, %a_b_lo, %c_d_hi, %c_d_lo;
    .reg .u64 %p_hi, %p_lo, %p_hi2, %p_lo2;
    .reg .u64 %ac_hi, %ac_lo, %bd_hi, %bd_lo;
    .reg .b64 %rdc_hi, %rdc_lo;

    ld.param.b64 %rdc_hi, [out_c_hi];
    ld.param.b64 %rdc_lo, [out_c_lo];

    // %a/%c = high limbs, %b/%d = low limbs (same convention as mul_lo_u16).
    ld.param.v2.u64 {%a, %b}, [in_a];
    ld.param.v2.u64 {%c, %d}, [in_b];

    mul.lo.u64 %ac_lo, %a, %c;
    mul.hi.u64 %ac_hi, %a, %c;
    mul.lo.u64 %bd_lo, %b, %d;
    mul.hi.u64 %bd_hi, %b, %d;

    add.cc.u64 %a_b_lo, %a, %b;
    addc.u64 %a_b_hi, %a, %b;          // NOTE(review): a+b+CF, not the carry bit
    add.cc.u64 %c_d_lo, %c, %d;
    addc.u64 %c_d_hi, %c, %d;          // NOTE(review): same issue as above

    mul.lo.u64 %p_lo, %a_b_lo, %c_d_lo;
    mul.hi.u64 %p_hi, %a_b_lo, %c_d_lo;
    mul.lo.u64 %p_hi2, %a_b_hi, %c_d_hi;

    // NOTE(review): the Karatsuba recombination belongs here and is missing.

    st.v2.u64 [%rdc_lo], {%p_hi, %p_lo};
    st.v2.u64 [%rdc_hi], {%a_b_lo, %p_hi2};

    ret;
}
|
||||
205
5/data science/1e/secp256k1.cu
Normal file
@ -0,0 +1,205 @@
|
||||
#include <stdio.h>
|
||||
#include <stdint.h>
|
||||
|
||||
extern "C" __device__ void add_u16(
|
||||
ulonglong2 *out_c,
|
||||
ulonglong2 in_a,
|
||||
ulonglong2 in_b
|
||||
);
|
||||
|
||||
extern "C" __device__ void sub_u16(
|
||||
ulonglong2 *out_c,
|
||||
ulonglong2 in_a,
|
||||
ulonglong2 in_b
|
||||
);
|
||||
|
||||
extern "C" __device__ void add_u32(
|
||||
ulonglong4 *out_c,
|
||||
ulonglong4 in_a,
|
||||
ulonglong4 in_b
|
||||
);
|
||||
|
||||
extern "C" __device__ void sub_u32(
|
||||
ulonglong4 *out_c,
|
||||
ulonglong4 in_a,
|
||||
ulonglong4 in_b
|
||||
);
|
||||
|
||||
extern "C" __device__ void mul_lo_u16(
|
||||
ulonglong2 *out_c,
|
||||
ulonglong2 in_a,
|
||||
ulonglong2 in_b
|
||||
);
|
||||
|
||||
extern "C" __device__ void mul_u16(
|
||||
ulonglong2 *out_c_hi,
|
||||
ulonglong2 *out_c_lo,
|
||||
ulonglong2 in_a,
|
||||
ulonglong2 in_b
|
||||
);
|
||||
|
||||
// True iff both 64-bit limbs of the 128-bit values are equal.
__device__ bool equ_u16(ulonglong2 a, ulonglong2 b) {
    if (a.x != b.x) return false;
    return a.y == b.y;
}
|
||||
|
||||
// True iff all four 64-bit limbs of the 256-bit values are equal.
__device__ bool equ_u32(ulonglong4 a, ulonglong4 b) {
    bool same = a.x == b.x;
    same = same && (a.y == b.y);
    same = same && (a.z == b.z);
    same = same && (a.w == b.w);
    return same;
}
|
||||
|
||||
// Three-way compare of two 256-bit values, most significant limb (.x) first.
// Returns -1 if a < b, 1 if a > b, 0 if equal.
__device__ int cmp_u32(ulonglong4 a, ulonglong4 b) {
    const unsigned long long la[4] = {a.x, a.y, a.z, a.w};
    const unsigned long long lb[4] = {b.x, b.y, b.z, b.w};

    for (int i = 0; i < 4; i++) {
        if (la[i] != lb[i]) {
            return la[i] < lb[i] ? -1 : 1;
        }
    }

    return 0;
}
|
||||
|
||||
// Low 256 bits of the 256x256-bit product: *out_c = in_a * in_b (mod 2^256).
// Limb order follows the rest of the file: .x is the most significant 64-bit
// limb, .w the least significant (see the add_u32/sub_u32 test vectors).
// Mirrors mul_lo_u16 one level up: splits each operand into two 128-bit
// halves and derives the cross term via  (a+b)(c+d) - ac - bd.
// NOTE(review): depends on mul_u16, which currently looks incomplete -- see
// the review note on it in op.ptx; verify before trusting results.
__device__ void mul_lo_u32(
    ulonglong4 *out_c,
    ulonglong4 in_a,
    ulonglong4 in_b
) {
    // Reinterpret each 256-bit value as two ulonglong2 (128-bit) halves:
    // a = high half of in_a, b = low half; c/d likewise for in_b.
    auto a = (ulonglong2 *)&in_a.x;
    auto b = (ulonglong2 *)&in_a.z;
    auto c = (ulonglong2 *)&in_b.x;
    auto d = (ulonglong2 *)&in_b.z;
    ulonglong2 a_b, c_d, ac, bd_hi, bd_lo, p;

    // ac: low 128 bits of high*high (only this part can reach the result's
    // high half through the subtraction below).
    mul_lo_u16(&ac, *a, *c);
    // bd: full 256-bit product of the two low halves.
    mul_u16(&bd_hi, &bd_lo, *b, *d);

    // a_b = a + b, c_d = c + d (mod 2^128). NOTE(review): the carries out of
    // these 128-bit adds are dropped -- confirm this cannot affect the low
    // 256 bits of the final product.
    add_u16(&a_b, *a, *b);
    add_u16(&c_d, *c, *d);

    // p = lo((a+b)(c+d))
    mul_lo_u16(&p, a_b, c_d);

    // p = lo(ad + bc) + hi(bd): the high 128-bit half of the result.
    sub_u16(&p, p, ac);
    sub_u16(&p, p, bd_lo);
    add_u16(&p, p, bd_hi);

    // Assemble: high half = p, low half = bd_lo.
    out_c->x = p.x;
    out_c->y = p.y;
    out_c->z = bd_lo.x;
    out_c->w = bd_lo.y;
}
|
||||
|
||||
// Prints a 128-bit value as "0x<high limb>.<low limb>", 16 hex digits per
// limb (same "." grouping as test.py's dothex). Debug-only: device printf
// is serialized and slow.
__device__ void print_u16(ulonglong2 a) {
    printf("0x%016llx.%016llx\n", a.x, a.y);
}
|
||||
|
||||
// Prints a 256-bit value as four dot-separated 16-digit hex limbs, most
// significant (.x) first. Debug-only: device printf is serialized and slow.
__device__ void print_u32(ulonglong4 a) {
    printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w);
}
|
||||
|
||||
#define U8_MAX 0xFFFFFFFFFFFFFFFF
|
||||
#define U16_MAX {U8_MAX, U8_MAX}
|
||||
#define U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX}
|
||||
|
||||
// Device-side self-test for the PTX bignum routines. Each scoped block
// exercises one routine against a precomputed reference value (test.py
// derives the multiply constants on the host) and clears *passed on
// mismatch, printing the routine name and the offending value.
// Launch as test<<<1, 1>>>: a single thread runs all cases sequentially.
__global__ void test(bool *passed) {
    *passed = true;
    // add_u32: (2^256 - 1) + 1 wraps to 0.
    {
        ulonglong4 a = U32_MAX;
        ulonglong4 b = {0, 0, 0, 1};
        ulonglong4 c = {0, 0, 0, 0};
        add_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("add_u32\n");
            print_u32(a);
            *passed = false;
        }
    }
    // sub_u32: 0 - 1 wraps to 2^256 - 1.
    {
        ulonglong4 a = {0, 0, 0, 0};
        ulonglong4 b = {0, 0, 0, 1};
        ulonglong4 c = U32_MAX;
        sub_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("sub_u32\n");
            print_u32(a);
            *passed = false;
        }
    }
    // mul_lo_u16: low 128 bits of (2^128 - 1) * (2^64 - 1).
    {
        ulonglong2 a = U16_MAX;
        ulonglong2 b = {0, U8_MAX};
        ulonglong2 c = {U8_MAX, 1};
        mul_lo_u16(&a, a, b);
        if (!equ_u16(a, c)) {
            printf("mul_lo_u16\n");
            print_u16(a);
            *passed = false;
        }
    }
    // mul_u16: full 256-bit products; the high half overwrites a, the low
    // half overwrites b.
    {
        ulonglong2 a = U16_MAX;
        ulonglong2 b = {0, U8_MAX};
        ulonglong2 c_hi = {0, U8_MAX - 1};
        ulonglong2 c_lo = {U8_MAX, 1};
        mul_u16(&a, &b, a, b);
        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
            printf("mul_u16\n");
            print_u16(a);
            print_u16(b);
            *passed = false;
        }
        // Second vector: (2^128 - 1)^2.
        a = U16_MAX;
        b = U16_MAX;
        c_hi = {U8_MAX, U8_MAX - 1};
        c_lo = {0, 1};
        mul_u16(&a, &b, a, b);
        if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
            printf("mul_u16\n");
            print_u16(a);
            print_u16(b);
            *passed = false;
        }
    }
    // mul_lo_u32: low 256 bits of (2^256 - 1) * (2^128 - 1).
    {
        ulonglong4 a = U32_MAX;
        ulonglong4 b = {0, 0, U8_MAX, U8_MAX};
        ulonglong4 c = {U8_MAX, U8_MAX, 0, 1};
        mul_lo_u32(&a, a, b);
        if (!equ_u32(a, c)) {
            printf("mul_lo_u32\n");
            print_u32(a);
            *passed = false;
        }
    }
}
|
||||
|
||||
// Launches the device self-test kernel and reports the result.
// Returns 0 on success, 1 if any sub-test or CUDA call failed.
int main() {
    // FIX: initialize the flag so a failed copy-back cannot leave it as
    // uninitialized garbage.
    bool test_passed = false, *d_test_passed;
    cudaMalloc(&d_test_passed, sizeof(bool));

    test<<<1, 1>>>(d_test_passed);
    // FIX: surface launch/execution errors instead of silently reading back
    // stale memory when the kernel never ran.
    cudaError_t err = cudaGetLastError();
    if (err == cudaSuccess) err = cudaDeviceSynchronize();
    if (err != cudaSuccess) {
        printf("CUDA error: %s\n", cudaGetErrorString(err));
        cudaFree(d_test_passed);
        return 1;
    }

    cudaMemcpy(&test_passed, d_test_passed, sizeof(bool), cudaMemcpyDeviceToHost);
    cudaFree(d_test_passed);

    if (!test_passed) {
        printf("test not passed\n");
        return 1;
    }

    return 0;
}
|
||||
22
5/data science/1e/test.py
Normal file
@ -0,0 +1,22 @@
|
||||
# All-ones constants for 64/128/256-bit unsigned limb math.
U8_MAX = (1 << 64) - 1
U16_MAX = (1 << 128) - 1
U32_MAX = (1 << 256) - 1
|
||||
|
||||
|
||||
def dothex(num):
    """Render num as hex with a '.' between every 16 hex digits.

    Groups correspond to 64-bit limbs, least significant group last,
    matching the print_u16/print_u32 format in secp256k1.cu.
    """
    digits = hex(num)[2:]
    groups = []
    while len(digits) > 16:
        groups.append(digits[-16:])
        digits = digits[:-16]
    groups.append(digits)

    return '0x' + '.'.join(reversed(groups))
|
||||
|
||||
# Host-side reference values for the PTX routines (compare against the
# device test kernel's expected constants in secp256k1.cu).
mod16 = U16_MAX + 1
mod32 = U32_MAX + 1

print('mul_u16', dothex((U16_MAX * U8_MAX >> 128) % mod16),
      dothex(U16_MAX * U8_MAX % mod16))
print('mul_u16', dothex((U16_MAX * U16_MAX >> 128) % mod16),
      dothex(U16_MAX * U16_MAX % mod16))
print('mul_lo_u32', dothex(U32_MAX * U16_MAX % mod32))
print('div_lo_u32', dothex(U32_MAX // U8_MAX), dothex(U32_MAX - U32_MAX // U8_MAX))
|
||||
2477
5/data science/2/1-02_data_manipulation.ipynb
Normal file
958
5/data science/2/1-03_memory_management.ipynb
Normal file
@ -0,0 +1,958 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "def31b0f-921a-43eb-9807-8b9b31eb7b32",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "4a0fd4dd-f7be-4c90-8ddd-384a760ac04f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fundamentals of Accelerated Data Science # "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6a8fdf2e-a481-455e-8a52-8be8472b63bf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 03 - Memory Management ##\n",
|
||||
"\n",
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook explores the dynamics between data and memory. This notebook covers the below sections: \n",
|
||||
"1. [Memory Management](#Memory-Management)\n",
|
||||
" * [Memory Usage](#Memory-Usage)\n",
|
||||
"2. [Data Types](#Data-Types)\n",
|
||||
" * [Convert Data Types](#Convert-Data-Types)\n",
|
||||
" * [Exercise #1 - Modify `dtypes`](#Exercise-#1---Modify-dtypes)\n",
|
||||
" * [Categorical](#Categorical)\n",
|
||||
"3. [Efficient Data Loading](#Efficient-Data-Loading)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1b59367c-48bc-4c72-b1f4-4cfdfa5470cf",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Memory Management ##\n",
|
||||
"During the data acquisition process, data is transferred to memory in order to be operated on by the processor. Memory management is crucial for cuDF and GPU operations for several key reasons: \n",
|
||||
"* **Limited GPU memory**: GPUs typically have less memory than CPUs, therefore efficient memory management is essential to maximize the use of available GPU memory, especially for large datasets.\n",
|
||||
"* **Data transfer overhead**: Transferring data between CPU and GPU memory is relatively slow compared to GPU computation speed. Minimizing these transfers through smart memory management is critical for performance.\n",
|
||||
"* **Performance tuning**: Understanding and optimizing memory usage is key to achieving peak performance in GPU-accelerated data processing tasks.\n",
|
||||
"\n",
|
||||
"When done correctly, keeping the data on the GPU can enable cuDF and the RAPIDS ecosystem to achieve significant performance improvements, handle larger datasets, and provide more efficient data processing capabilities. \n",
|
||||
"\n",
|
||||
"Below we import the data from the csv file. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b7b8a623-f799-4dad-aca9-0e571bb6e527",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"import pandas as pd\n",
|
||||
"import random\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "711d0a7f-8598-49fc-949c-5caf6029ce47",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>age</th>\n",
|
||||
" <th>sex</th>\n",
|
||||
" <th>county</th>\n",
|
||||
" <th>lat</th>\n",
|
||||
" <th>long</th>\n",
|
||||
" <th>name</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>m</td>\n",
|
||||
" <td>DARLINGTON</td>\n",
|
||||
" <td>54.533644</td>\n",
|
||||
" <td>-1.524401</td>\n",
|
||||
" <td>FRANCIS</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>m</td>\n",
|
||||
" <td>DARLINGTON</td>\n",
|
||||
" <td>54.426256</td>\n",
|
||||
" <td>-1.465314</td>\n",
|
||||
" <td>EDWARD</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>m</td>\n",
|
||||
" <td>DARLINGTON</td>\n",
|
||||
" <td>54.555200</td>\n",
|
||||
" <td>-1.496417</td>\n",
|
||||
" <td>TEDDY</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>m</td>\n",
|
||||
" <td>DARLINGTON</td>\n",
|
||||
" <td>54.547906</td>\n",
|
||||
" <td>-1.572341</td>\n",
|
||||
" <td>ANGUS</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>0</td>\n",
|
||||
" <td>m</td>\n",
|
||||
" <td>DARLINGTON</td>\n",
|
||||
" <td>54.477639</td>\n",
|
||||
" <td>-1.605995</td>\n",
|
||||
" <td>CHARLIE</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" age sex county lat long name\n",
|
||||
"0 0 m DARLINGTON 54.533644 -1.524401 FRANCIS\n",
|
||||
"1 0 m DARLINGTON 54.426256 -1.465314 EDWARD\n",
|
||||
"2 0 m DARLINGTON 54.555200 -1.496417 TEDDY\n",
|
||||
"3 0 m DARLINGTON 54.547906 -1.572341 ANGUS\n",
|
||||
"4 0 m DARLINGTON 54.477639 -1.605995 CHARLIE"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"df=pd.read_csv('./data/uk_pop.csv')\n",
|
||||
"\n",
|
||||
"# preview\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "36416fd0-7081-42aa-bf31-d1231b81ec0b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Memory Usage ###\n",
|
||||
"Memory utilization of a DataFrame depends on the data types for each column.\n",
|
||||
"\n",
|
||||
"<p><img src='images/dtypes.png' width=720></p>\n",
|
||||
"\n",
|
||||
"We can use `DataFrame.memory_usage()` to see the memory usage for each column (in bytes). Most of the common data types have a fixed size in memory, such as `int`, `float`, `datetime`, and `bool`. Memory usage for these data types is the respective memory requirement multiplied by the number of data points. For `string` data type, the memory usage reported _for pandas_ is the number of elements times 8 bytes. This accounts for the 64-bit required for the pointer that points to an address in memory but not the memory used for the actual string values. The actual memory required for a string value is 49 bytes plus an additional byte for each character. The `deep` parameter provides a more accurate memory usage report that accounts for the system-level memory consumption of the contained `string` data type. \n",
|
||||
"\n",
|
||||
"Below we get the memory usage. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "8378207b-2d9e-4102-8408-c2dddafc8a40",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index 128\n",
|
||||
"age 467839152\n",
|
||||
"sex 3391833852\n",
|
||||
"county 3934985133\n",
|
||||
"lat 467839152\n",
|
||||
"long 467839152\n",
|
||||
"name 3666922374\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# pandas memory utilization\n",
|
||||
"mem_usage_df=df.memory_usage(deep=True)\n",
|
||||
"mem_usage_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "07c24bb1-c4f7-440c-a949-d4c57800ec61",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below we define a `make_decimal()` function to convert memory size into units based on powers of 2. In contrast to units based on powers of 10, this customary convention is commonly used to report memory capacity. More information about the two definitions can be found [here](https://en.wikipedia.org/wiki/Byte#Multiple-byte_units). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "5ae42218-1547-49fd-9123-ab508a2b03de",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"suffixes = ['B', 'kB', 'MB', 'GB', 'TB', 'PB']\n",
|
||||
"def make_decimal(nbytes):\n",
|
||||
" i=0\n",
|
||||
" while nbytes >= 1024 and i < len(suffixes)-1:\n",
|
||||
" nbytes/=1024.\n",
|
||||
" i+=1\n",
|
||||
" f=('%.2f' % nbytes).rstrip('0').rstrip('.')\n",
|
||||
" return '%s %s' % (f, suffixes[i])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "e6d4a613-3eea-4dce-8e71-39593ff6f226",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"'11.55 GB'"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"make_decimal(mem_usage_df.sum())"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a352c0b2-65aa-4231-b753-556aca46ff49",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Below we calculate the memory usage manually based on the data types. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "630327b9-6dc1-4b70-9fdf-9f7763ec4d50",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Numerical columns use 467839152 bytes of memory\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# get number of rows\n",
|
||||
"num_rows=len(df)\n",
|
||||
"\n",
|
||||
"# 64-bit numbers uses 8 bytes of memory\n",
|
||||
"print(f'Numerical columns use {num_rows*8} bytes of memory')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "bb22b5f4-e38f-438e-9426-61746b509e50",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"county column uses 3934985133 bytes of memory.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# check random string-typed column\n",
|
||||
"string_cols=[col for col in df.columns if df[col].dtype=='object' ]\n",
|
||||
"column_to_check=random.choice(string_cols)\n",
|
||||
"\n",
|
||||
"overhead=49\n",
|
||||
"pointer_size=8\n",
|
||||
"\n",
|
||||
"# nan==nan when value is not a number\n",
|
||||
"# nan uses 32 bytes of memory\n",
|
||||
"string_col_mem_usage_df=df[column_to_check].map(lambda x: len(x)+overhead+pointer_size if x else 32)\n",
|
||||
"string_col_mem_usage=string_col_mem_usage_df.sum()\n",
|
||||
"print(f'{column_to_check} column uses {string_col_mem_usage} bytes of memory.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "94e393c2-c0d0-40ee-82d2-730c4667e9b8",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Note**: The `string` data type is stored differently in cuDF than it is in pandas. More information about `libcudf` stores string data using the [Arrow format](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) can be found [here](https://developer.nvidia.com/blog/mastering-string-transformations-in-rapids-libcudf/). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "737ff50b-9426-4e08-a00a-d7ee69f48b9f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Data Types ##\n",
|
||||
"By default, pandas (and cuDF) uses 64-bit for numerical values. Using 64-bit numbers provides the highest precision but many applications do not require 64-bit precision when aggregating over a very large number of data points. When possible, using 32-bit numbers reduces storage and memory requirements in half, and also typically greatly speeds up computations because only half as much data needs to be accessed in memory. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0b77d450-c415-44b8-87ac-20ce616ec809",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Convert Data Types ###\n",
|
||||
"The `.astype()` method can be used to convert numerical data types to use different bit-size containers. Here we convert the `age` column from `int64` to `int8`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "603f7c70-134e-4466-a790-8a18b9088ca6",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"age int8\n",
|
||||
"sex object\n",
|
||||
"county object\n",
|
||||
"lat float64\n",
|
||||
"long float64\n",
|
||||
"name object\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"df['age']=df['age'].astype('int8')\n",
|
||||
"\n",
|
||||
"df.dtypes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "973a6dd4-2aef-44d9-8b01-8853032eddae",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Exercise #1 - Modify `dtypes` ###\n",
|
||||
"**Instructions**: <br>\n",
|
||||
"* Modify the `<FIXME>` only and execute the below cell to convert any 64-bit data types to their 32-bit counterparts."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "beb7d71b-6672-462e-b65c-a64dbe5f7a57",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"df['lat']=df['lat'].astype('float32')\n",
|
||||
"df['long']=df['long'].astype('float32')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "3b44fb22-a0f1-4e43-a332-1ccbad50caee",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"\n",
|
||||
"df['lat']=df['lat'].astype('float32')\n",
|
||||
"df['long']=df['long'].astype('float32')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "98b6542d-22cc-4926-b600-a3e052c37c96",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Click ... for solution. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7b2cd622-977c-4915-a87f-2fe03c1793f5",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Categorical ###\n",
|
||||
"Categorical data is a type of data that represents discrete, distinct categories or groups. They can have a meaningful order or ranking but generally cannot be used for numerical operations. When appropriate, using the `categorical` data type can reduce memory usage and lead to faster operations. It can also be used to define and maintain a custom order of categories. \n",
|
||||
"\n",
|
||||
"Below we get the number of unique values in the string columns. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "f249e4b8-5d7a-4b44-ac15-bd3360a43f2a",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"sex 2\n",
|
||||
"county 171\n",
|
||||
"name 13212\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 11,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"df.select_dtypes(include='object').nunique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f1d8bd88-b39b-4043-9039-d8bd75fe851a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
        "Below we convert columns with few discrete values to `category`. The `category` data type has `.categories` and `.codes` properties that are accessed through the `.cat` accessor. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "a99bebbf-2e5b-4720-96f9-9fd7d42d2fe8",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"df['sex']=df['sex'].astype('category')\n",
|
||||
"df['county']=df['county'].astype('category')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "41b7b290-cfcf-4ff6-b6b4-454c19b44a62",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index(['BARKING AND DAGENHAM', 'BARNET', 'BARNSLEY',\n",
|
||||
" 'BATH AND NORTH EAST SOMERSET', 'BEDFORD', 'BEXLEY', 'BIRMINGHAM',\n",
|
||||
" 'BLACKBURN WITH DARWEN', 'BLACKPOOL', 'BLAENAU GWENT',\n",
|
||||
" ...\n",
|
||||
" 'WESTMINSTER', 'WIGAN', 'WILTSHIRE', 'WINDSOR AND MAIDENHEAD', 'WIRRAL',\n",
|
||||
" 'WOKINGHAM', 'WOLVERHAMPTON', 'WORCESTERSHIRE', 'WREXHAM', 'YORK'],\n",
|
||||
" dtype='object', length=171)"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"----------------------------------------\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"0 37\n",
|
||||
"1 37\n",
|
||||
"2 37\n",
|
||||
"3 37\n",
|
||||
"4 37\n",
|
||||
" ..\n",
|
||||
"58479889 96\n",
|
||||
"58479890 96\n",
|
||||
"58479891 96\n",
|
||||
"58479892 96\n",
|
||||
"58479893 96\n",
|
||||
"Length: 58479894, dtype: int16"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"display(df['county'].cat.categories)\n",
|
||||
"print('-'*40)\n",
|
||||
"display(df['county'].cat.codes)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "737385ab-677c-4bef-a86a-10aa3119e29a",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Note**: `.astype()` can also be used to convert data to `datetime` or `object` to enable datetime and string methods. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "552c47c2-0fbc-455e-8745-cb98fc777243",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Efficient Data Loading ##\n",
|
||||
        "It is often advantageous to specify the most appropriate data type for each column, based on its value range, precision requirements, and how it is used. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "c2b9f0c3-8598-4a28-9481-ce28fea7544b",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"Index 128\n",
|
||||
"age 467839152\n",
|
||||
"sex 3391833852\n",
|
||||
"county 3934985133\n",
|
||||
"lat 467839152\n",
|
||||
"long 467839152\n",
|
||||
"name 3666922374\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading 11.55 GB took 33.63 seconds.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"start=time.time()\n",
|
||||
"df=pd.read_csv('./data/uk_pop.csv')\n",
|
||||
"duration=time.time()-start\n",
|
||||
"\n",
|
||||
"mem_usage_df=df.memory_usage(deep=True)\n",
|
||||
"display(mem_usage_df)\n",
|
||||
"\n",
|
||||
"print(f'Loading {make_decimal(mem_usage_df.sum())} took {round(duration, 2)} seconds.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5729520e-3ed8-4ec6-ae1f-ba46d642f48d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
        "Below we enable `cudf.pandas` to see the difference. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 15,
|
||||
"id": "99aa0f32-4d2a-43a7-bec1-f1b88bcc37c2",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"%load_ext cudf.pandas\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import time"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"id": "2b724201-9ad1-4e9b-b712-f3b31bdc4104",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"suffixes = ['B', 'kB', 'MB', 'GB', 'TB', 'PB']\n",
|
||||
"def make_decimal(nbytes):\n",
|
||||
" i=0\n",
|
||||
" while nbytes >= 1024 and i < len(suffixes)-1:\n",
|
||||
" nbytes/=1024.\n",
|
||||
" i+=1\n",
|
||||
" f=('%.2f' % nbytes).rstrip('0').rstrip('.')\n",
|
||||
" return '%s %s' % (f, suffixes[i])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "99bdd7b0-8563-41db-bd8e-3a7279394ede",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"age 58479894\n",
|
||||
"sex 58479908\n",
|
||||
"county 58482446\n",
|
||||
"lat 467839152\n",
|
||||
"long 467839152\n",
|
||||
"name 117096917\n",
|
||||
"Index 0\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Loading 1.14 GB took 2.13 seconds.\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> </span>\n",
|
||||
"<span style=\"font-style: italic\"> Total time elapsed: 2.705 seconds </span>\n",
|
||||
"<span style=\"font-style: italic\"> </span>\n",
|
||||
"<span style=\"font-style: italic\"> Stats </span>\n",
|
||||
"<span style=\"font-style: italic\"> </span>\n",
|
||||
"┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
|
||||
"┃<span style=\"font-weight: bold\"> Line no. </span>┃<span style=\"font-weight: bold\"> Line </span>┃<span style=\"font-weight: bold\"> GPU TIME(s) </span>┃<span style=\"font-weight: bold\"> CPU TIME(s) </span>┃\n",
|
||||
"┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
|
||||
"│ 2 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> start</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time()</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 5 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> dtype_dict</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">{</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 6 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'age'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'int8'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 7 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'sex'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 8 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'county'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 9 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'lat'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'float64'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 10 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'long'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'float64'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 11 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'name'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 14 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> efficient_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">pd</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">read_csv(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'./data/uk_pop.csv'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, dtype</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">dtype_dict)</span><span style=\"background-color: #272822\"> </span> │ 1.728013188 │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 15 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> duration</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time()</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">-</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">start</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 17 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> mem_usage_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">efficient_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">memory_usage(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'deep'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">)</span><span style=\"background-color: #272822\"> </span> │ 0.005340174 │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 18 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> display(mem_usage_df)</span><span style=\"background-color: #272822\"> </span> │ 0.011073721 │ 0.006896915 │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"│ 20 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> print(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">f'Loading {</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">make_decimal(mem_usage_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">sum())</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">} took {</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">round(dura…</span> │ 0.004693074 │ │\n",
|
||||
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
|
||||
"└──────────┴──────────────────────────────────────────────────────────────────────────┴─────────────┴─────────────┘\n",
|
||||
"</pre>\n"
|
||||
],
|
||||
"text/plain": [
|
||||
"\u001b[3m \u001b[0m\n",
|
||||
"\u001b[3m Total time elapsed: 2.705 seconds \u001b[0m\n",
|
||||
"\u001b[3m \u001b[0m\n",
|
||||
"\u001b[3m Stats \u001b[0m\n",
|
||||
"\u001b[3m \u001b[0m\n",
|
||||
"┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
|
||||
"┃\u001b[1m \u001b[0m\u001b[1mLine no.\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mLine \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mGPU TIME(s)\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mCPU TIME(s)\u001b[0m\u001b[1m \u001b[0m┃\n",
|
||||
"┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
|
||||
"│ 2 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mstart\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 5 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype_dict\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m{\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 6 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mage\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mint8\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 7 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34msex\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 8 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcounty\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 9 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mlat\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mfloat64\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 10 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mlong\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mfloat64\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 11 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mname\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 14 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mefficient_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mpd\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mread_csv\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m./data/uk_pop.csv\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype_dict\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 1.728013188 │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 15 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mduration\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m-\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mstart\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 17 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mefficient_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmemory_usage\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mdeep\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 0.005340174 │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 18 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdisplay\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 0.011073721 │ 0.006896915 │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"│ 20 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mprint\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mf\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mLoading \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m{\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmake_decimal\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34msum\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m}\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m took \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m{\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mround\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdura…\u001b[0m │ 0.004693074 │ │\n",
|
||||
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
|
||||
"└──────────┴──────────────────────────────────────────────────────────────────────────┴─────────────┴─────────────┘\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%cudf.pandas.line_profile\n",
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"start=time.time()\n",
|
||||
"\n",
|
||||
"# define data types for each column\n",
|
||||
"dtype_dict={\n",
|
||||
" 'age': 'int8', \n",
|
||||
" 'sex': 'category', \n",
|
||||
" 'county': 'category', \n",
|
||||
" 'lat': 'float64', \n",
|
||||
" 'long': 'float64', \n",
|
||||
" 'name': 'category'\n",
|
||||
"}\n",
|
||||
" \n",
|
||||
"efficient_df=pd.read_csv('./data/uk_pop.csv', dtype=dtype_dict)\n",
|
||||
"duration=time.time()-start\n",
|
||||
"\n",
|
||||
"mem_usage_df=efficient_df.memory_usage('deep')\n",
|
||||
"display(mem_usage_df)\n",
|
||||
"\n",
|
||||
"print(f'Loading {make_decimal(mem_usage_df.sum())} took {round(duration, 2)} seconds.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0f4607d8-6de3-4b27-96d4-a9720d268333",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"We were able to load data faster and more efficiently. \n",
|
||||
"\n",
|
||||
"**Note**: Notice that the memory utilized on the GPU is larger than the memory used by the DataFrame. This is expected because there are intermediary processes that use some memory during the data loading process, specifically related to parsing the csv file in this case. \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"+-----------------------------------------------------------------------------+\n",
|
||||
"| NVIDIA-SMI 525.60.13 Driver Version: 525.60.13 CUDA Version: 12.0 |\n",
|
||||
"|-------------------------------+----------------------+----------------------+\n",
|
||||
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
|
||||
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
|
||||
"| | | MIG M. |\n",
|
||||
"|===============================+======================+======================|\n",
|
||||
"| 0 Tesla T4 Off | 00000000:00:1B.0 Off | 0 |\n",
|
||||
"| N/A 32C P0 26W / 70W | 1378MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 1 Tesla T4 Off | 00000000:00:1C.0 Off | 0 |\n",
|
||||
"| N/A 31C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 2 Tesla T4 Off | 00000000:00:1D.0 Off | 0 |\n",
|
||||
"| N/A 30C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 3 Tesla T4 Off | 00000000:00:1E.0 Off | 0 |\n",
|
||||
"| N/A 30C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
" \n",
|
||||
"+-----------------------------------------------------------------------------+\n",
|
||||
"| Processes: |\n",
|
||||
"| GPU GI CI PID Type Process name GPU Memory |\n",
|
||||
"| ID ID Usage |\n",
|
||||
"|=============================================================================|\n",
|
||||
"+-----------------------------------------------------------------------------+\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 18,
|
||||
"id": "92f7ee37-4acb-46aa-bb73-4c0139d3f6b8",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Tue Oct 21 08:08:25 2025 \n",
|
||||
"+-----------------------------------------------------------------------------+\n",
|
||||
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\n",
|
||||
"|-------------------------------+----------------------+----------------------+\n",
|
||||
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
|
||||
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
|
||||
"| | | MIG M. |\n",
|
||||
"|===============================+======================+======================|\n",
|
||||
"| 0 Tesla T4 On | 00000000:00:1B.0 Off | 0 |\n",
|
||||
"| N/A 28C P0 24W / 70W | 11314MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 1 Tesla T4 On | 00000000:00:1C.0 Off | 0 |\n",
|
||||
"| N/A 29C P0 25W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 2 Tesla T4 On | 00000000:00:1D.0 Off | 0 |\n",
|
||||
"| N/A 28C P0 25W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
"| 3 Tesla T4 On | 00000000:00:1E.0 Off | 0 |\n",
|
||||
"| N/A 29C P0 24W / 70W | 168MiB / 15360MiB | 0% Default |\n",
|
||||
"| | | N/A |\n",
|
||||
"+-------------------------------+----------------------+----------------------+\n",
|
||||
" \n",
|
||||
"+-----------------------------------------------------------------------------+\n",
|
||||
"| Processes: |\n",
|
||||
"| GPU GI CI PID Type Process name GPU Memory |\n",
|
||||
"| ID ID Usage |\n",
|
||||
"|=============================================================================|\n",
|
||||
"+-----------------------------------------------------------------------------+\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"!nvidia-smi"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c031d2c7-03cb-4ac7-a195-70fc25cb191d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"When loading data this way, we may be able to fit more data. The optimal dataset size depends on various factors including the specific operations being performed, the complexity of the workload, and the available GPU memory. To maximize acceleration, datasets should ideally fit within GPU memory, with ample space left for operations that can spike memory requirements. As a general rule of thumb, cuDF recommends data sets that are less than 50% of the GPU memory capacity. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ec6cefea-dc64-4f13-815e-081cd35651b9",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# 1 gigabytes = 1073741824 bytes\n",
|
||||
"mem_capacity=16*1073741824\n",
|
||||
"\n",
|
||||
"mem_per_record=mem_usage_df.sum()/len(efficient_df)\n",
|
||||
"\n",
|
||||
"print(f'We can load {int(mem_capacity/2/mem_per_record)} rows.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "ddaaa1ac-66ec-4323-9842-2543c6d85e4e",
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "658e9847-775f-4d12-af4e-8f896df4e6fe",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Well Done!** Let's move to the [next notebook](1-04_interoperability.ipynb). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b86451cf-60e6-4733-b431-1bc0bd586bc2",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
1269
5/data science/2/1-04_interoperability.ipynb
Normal file
1088
5/data science/2/1-05_grouping.ipynb
Normal file
2259
5/data science/2/1-06_data_visualization.ipynb
Normal file
1123
5/data science/2/1-07_etl.ipynb
Normal file
1220
5/data science/2/1-08_cudf-polars.ipynb
Normal file
712
5/data science/2/1-09_dask-cudf.ipynb
Normal file
1559
5/data science/2/2-02_prep_graph.ipynb
Normal file
1706
5/data science/2/2-03_cugraph.ipynb
Normal file
669
5/data science/2/2-04_networkx_cugraph.ipynb
Normal file
@ -0,0 +1,669 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2d190a78-7253-4fad-9d9c-6b4fb33c8bf2",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8a2c4abf-6278-4edd-83f8-f0afac4c834f",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fundamentals of Accelerated Data Science #"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e1e78ef4-c0de-433e-8616-bd946f69d30e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 04 - cuGraph as a NetworkX backend ##"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0828e0b4-7935-4b77-95ef-e06b72f0319e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook introduces the various methods of utilizing the cuGraph backend for NetworkX and runs centrality algorithms on the dataset. This notebook covers the below sections:\n",
|
||||
"1. [Background](#Background)\n",
|
||||
"2. [Installation](#Installation)\n",
|
||||
"3. [Utilizing nx-cugraph](#Utilizing-nx-cugraph)\n",
|
||||
" * [Runtime Environment Variable](#Runtime-Environment-Variable)\n",
|
||||
" * [Backend Keyword Argument](#Backend-Keyword-Argument)\n",
|
||||
" * [Type-Based Dispatching](#Type-Based-Dispatching)\n",
|
||||
"4. [Computing Centrality](#Computing-Centrality)\n",
|
||||
" * [Creating Graph](#Creating-Graph)\n",
|
||||
" * [Running Centrality Algorithms](#Running-Centrality-Algorithms)\n",
|
||||
" * [Betweenness Centrality](#Betweenness-Centrality)\n",
|
||||
" * [Degree Centrality](#Degree-Centrality)\n",
|
||||
" * [Katz Centrality](#Katz-Centrality)\n",
|
||||
" * [Pagerank Centrality](#Pagerank-Centrality)\n",
|
||||
" * [Eigenvector Centrality](#Eigenvector-Centrality)\n",
|
||||
" * [Visualize Results](#Visualize-Results)\n",
|
||||
" * [Exercise #1 - Type Dispatch](#Exercise-#1---Type-Dispatch)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c57b79ba-c7c7-49d2-9e21-c388bbe6ca98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Background ##\n",
|
||||
"RAPIDS recently introduced a new backend to NetworkX called nx-cugraph. With this backend, you can automatically accelerate supported algorithms. In this notebook, we will cover the various methods of enabling the cugraph backend, and use the backend to run different centrality algorithms."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "697ea4c9-b416-43d5-9d2c-28aa41ef2561",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Installation ##\n",
|
||||
"We have already prepared the environment with nx-cugraph installed. When you are using your own environment, below is the command for installation. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"id": "2fe07200-4f66-4604-9950-40ade1938f4c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"pip install nx-cugraph-cu12 --no-deps --extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a9ea09f4-6c93-4785-bcc3-44c6f040dfc6",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Utilizing nx-cugraph ##\n",
|
||||
"There are 3 ways to utilize nx-cugraph\n",
|
||||
"\n",
|
||||
"1. **Environment Variable at Runtime**\n",
|
||||
"2. **Backend keyword argument**\n",
|
||||
"3. **Type-Based dispatching**\n",
|
||||
"\n",
|
||||
"Let's dig a little deeper in to each of these methods."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8b4322fd-9f56-4cbc-a00c-8fac4b2b2fe1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Runtime Environment Variable ###\n",
|
||||
"The NETWORKX_AUTOMATIC_BACKENDS environment variable can be used to have NetworkX automatically dispatch to specified backends. Set NETWORKX_AUTOMATIC_BACKENDS=cugraph to use nx-cugraph to GPU accelerate supported APIs with no code changes. We will also be loading the cuDF pandas module to accelerate csv loading."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "b41fef7f-5d43-4481-98a7-d9f3cb54066c",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"!NETWORKX_AUTOMATIC_BACKENDS=cugraph python -m cudf.pandas scripts/networkx.py"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5ffb6c4b-a03a-4bfb-9b92-14c59e6dcd75",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Backend Keyword Argument ###\n",
|
||||
"NetworkX also supports explicitly specifying a particular backend for supported APIs with the backend= keyword argument. This argument takes precedence over the NETWORKX_AUTOMATIC_BACKENDS environment variable. This method also requires that the specified backend already be installed."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "8183ecc7-8544-4914-8c07-c904ba12225a",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import warnings\n",
|
||||
"warnings.filterwarnings('ignore')\n",
|
||||
"\n",
|
||||
"%load_ext cudf.pandas\n",
|
||||
"import networkx as nx\n",
|
||||
"import pandas as pd\n",
|
||||
"import matplotlib.pyplot as plt\n",
|
||||
"\n",
|
||||
"# Load the CSV file\n",
|
||||
"road_graph = pd.read_csv('./data/road_graph.csv', dtype=['int32', 'int32', 'float32'], nrows=1000)\n",
|
||||
"\n",
|
||||
"# Create an empty graph\n",
|
||||
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')\n",
|
||||
"b = nx.betweenness_centrality(G, k=1000, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "e588aa65-6281-4c19-a51c-42f044636ac0",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Type-Based Dispatching ###\n",
|
||||
"For users wanting to ensure a particular behavior, without the potential for runtime conversions, NetworkX offers type-based dispatching. To utilize this method, users must import the desired backend and create a Graph instance for it."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "5fea9300-8d75-443a-9ec0-ee65c8ccaf0f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import networkx as nx\n",
|
||||
"import nx_cugraph as nxcg\n",
|
||||
"\n",
|
||||
"# Loading data from previous cell\n",
|
||||
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length') \n",
|
||||
"\n",
|
||||
"nxcg_G = nxcg.from_networkx(G) # conversion happens once here\n",
|
||||
"b = nx.betweenness_centrality(nxcg_G, k=1000) # nxcg Graph type causes cugraph backend to be used, no conversion necessary"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "cb5a17e1-d886-4d20-8d4b-ce900280279c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Computing Centrality ##\n",
|
||||
"Now that we learned how to enable nx-cugraph, let's try to use it in a workflow! We will be using the backend argument for this example. First let's create a graph."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "19bea37c-bccf-4815-81bd-aa1de553812d",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Creating Graph ###"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "2b4420d7-7c89-4914-809f-4e323a12f47f",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# Create a graph from already loaded dataframe\n",
|
||||
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7dc1ad5b-8454-4277-9568-0cdacbebd9f1",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Running Centrality Algorithms ###\n",
|
||||
"Now, let's run the various centrality algorithms!"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1c52b7b3-6c23-45be-9ace-34a667f132aa",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Betweenness Centrality ###\n",
|
||||
"Quantifies the number of times a node acts as a bridge along the shortest path between two other nodes, highlighting its importance in information flow"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "281374af-c7cf-4592-a34d-796c1158dab6",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"b = nx.betweenness_centrality(G, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f98b2975-1f72-4bff-83c7-ace7aab65d98",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Degree Centrality ###\n",
|
||||
"Measures the number of direct connections a node has, indicating how well-connected it is within the network"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3e0c4460-6d25-4a2b-8b8f-8f8c6ef617b0",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"d = nx.degree_centrality(G, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0665a659-16b1-48b4-b3bb-9aa5659ef91c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Katz Centrality ###\n",
|
||||
"Measures a node's centrality based on its global influence in the network, considering both direct and indirect connections"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "8ce418d2-9eda-40bc-9733-b82d8d7556b1",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"k = nx.katz_centrality(G, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0712cedb-87ba-4a08-a74d-24997d02a636",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Pagerank Centrality ###\n",
|
||||
"Determines a node's importance based on the quantity and quality of links to it, similar to Google's original PageRank algorithm"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "a17ee15b-8758-484b-82b9-a158187231c5",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"p = nx.pagerank(G, max_iter=10, tol=1.0e-3, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c5f57a5e-95e4-47f7-a9ec-04a99fa2c1dc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Eigenvector Centrality ###\n",
|
||||
"Assigns scores to nodes based on the principle that connections to high-scoring nodes contribute more to the node's own score than connections to low-scoring nodes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "3eb1e358-ae8e-4399-bf45-90616b663e9d",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"e = nx.eigenvector_centrality(G, max_iter=1000, tol=1.0e-3, backend=\"cugraph\")"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0bc9178c-e66a-4c75-bf91-0c5d668b5634",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Visualize Results ###\n",
|
||||
"Now let's visualize results! We will only display the top 5 rows for readibility. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "69b6c23d-78a0-4dbb-be19-913ad180fe94",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style type=\"text/css\">\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_9f2bb\" style='display:inline'>\n",
|
||||
" <caption>Degree</caption>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_9f2bb_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
|
||||
" <th id=\"T_9f2bb_level0_col1\" class=\"col_heading level0 col1\" >degree_centrality</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_9f2bb_row0_col0\" class=\"data row0 col0\" >24</td>\n",
|
||||
" <td id=\"T_9f2bb_row0_col1\" class=\"data row0 col1\" >0.002847</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_9f2bb_row1_col0\" class=\"data row1 col0\" >72</td>\n",
|
||||
" <td id=\"T_9f2bb_row1_col1\" class=\"data row1 col1\" >0.002847</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_9f2bb_row2_col0\" class=\"data row2 col0\" >86</td>\n",
|
||||
" <td id=\"T_9f2bb_row2_col1\" class=\"data row2 col1\" >0.002847</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_9f2bb_row3_col0\" class=\"data row3 col0\" >127</td>\n",
|
||||
" <td id=\"T_9f2bb_row3_col1\" class=\"data row3 col1\" >0.002847</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_9f2bb_row4_col0\" class=\"data row4 col0\" >133</td>\n",
|
||||
" <td id=\"T_9f2bb_row4_col1\" class=\"data row4 col1\" >0.002847</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<style type=\"text/css\">\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_c13b0\" style='display:inline'>\n",
|
||||
" <caption>Betweenness</caption>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_c13b0_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
|
||||
" <th id=\"T_c13b0_level0_col1\" class=\"col_heading level0 col1\" >betweenness_centrality</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_c13b0_row0_col0\" class=\"data row0 col0\" >222</td>\n",
|
||||
" <td id=\"T_c13b0_row0_col1\" class=\"data row0 col1\" >0.000007</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_c13b0_row1_col0\" class=\"data row1 col0\" >381</td>\n",
|
||||
" <td id=\"T_c13b0_row1_col1\" class=\"data row1 col1\" >0.000007</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_c13b0_row2_col0\" class=\"data row2 col0\" >24</td>\n",
|
||||
" <td id=\"T_c13b0_row2_col1\" class=\"data row2 col1\" >0.000006</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_c13b0_row3_col0\" class=\"data row3 col0\" >72</td>\n",
|
||||
" <td id=\"T_c13b0_row3_col1\" class=\"data row3 col1\" >0.000006</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_c13b0_row4_col0\" class=\"data row4 col0\" >86</td>\n",
|
||||
" <td id=\"T_c13b0_row4_col1\" class=\"data row4 col1\" >0.000006</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<style type=\"text/css\">\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_afb59\" style='display:inline'>\n",
|
||||
" <caption>Katz</caption>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_afb59_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
|
||||
" <th id=\"T_afb59_level0_col1\" class=\"col_heading level0 col1\" >katz_centrality</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_afb59_row0_col0\" class=\"data row0 col0\" >24</td>\n",
|
||||
" <td id=\"T_afb59_row0_col1\" class=\"data row0 col1\" >0.033058</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_afb59_row1_col0\" class=\"data row1 col0\" >72</td>\n",
|
||||
" <td id=\"T_afb59_row1_col1\" class=\"data row1 col1\" >0.033058</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_afb59_row2_col0\" class=\"data row2 col0\" >86</td>\n",
|
||||
" <td id=\"T_afb59_row2_col1\" class=\"data row2 col1\" >0.033058</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_afb59_row3_col0\" class=\"data row3 col0\" >127</td>\n",
|
||||
" <td id=\"T_afb59_row3_col1\" class=\"data row3 col1\" >0.033058</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_afb59_row4_col0\" class=\"data row4 col0\" >133</td>\n",
|
||||
" <td id=\"T_afb59_row4_col1\" class=\"data row4 col1\" >0.033058</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<style type=\"text/css\">\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_bb8df\" style='display:inline'>\n",
|
||||
" <caption>PageRank</caption>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_bb8df_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
|
||||
" <th id=\"T_bb8df_level0_col1\" class=\"col_heading level0 col1\" >pagerank</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_bb8df_row0_col0\" class=\"data row0 col0\" >24</td>\n",
|
||||
" <td id=\"T_bb8df_row0_col1\" class=\"data row0 col1\" >0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_bb8df_row1_col0\" class=\"data row1 col0\" >72</td>\n",
|
||||
" <td id=\"T_bb8df_row1_col1\" class=\"data row1 col1\" >0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_bb8df_row2_col0\" class=\"data row2 col0\" >86</td>\n",
|
||||
" <td id=\"T_bb8df_row2_col1\" class=\"data row2 col1\" >0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_bb8df_row3_col0\" class=\"data row3 col0\" >127</td>\n",
|
||||
" <td id=\"T_bb8df_row3_col1\" class=\"data row3 col1\" >0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_bb8df_row4_col0\" class=\"data row4 col0\" >133</td>\n",
|
||||
" <td id=\"T_bb8df_row4_col1\" class=\"data row4 col1\" >0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"<style type=\"text/css\">\n",
|
||||
"</style>\n",
|
||||
"<table id=\"T_f5314\" style='display:inline'>\n",
|
||||
" <caption>EigenVector</caption>\n",
|
||||
" <thead>\n",
|
||||
" <tr>\n",
|
||||
" <th id=\"T_f5314_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
|
||||
" <th id=\"T_f5314_level0_col1\" class=\"col_heading level0 col1\" >eigenvector_centrality</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_f5314_row0_col0\" class=\"data row0 col0\" >24</td>\n",
|
||||
" <td id=\"T_f5314_row0_col1\" class=\"data row0 col1\" >0.064086</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_f5314_row1_col0\" class=\"data row1 col0\" >72</td>\n",
|
||||
" <td id=\"T_f5314_row1_col1\" class=\"data row1 col1\" >0.064086</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_f5314_row2_col0\" class=\"data row2 col0\" >86</td>\n",
|
||||
" <td id=\"T_f5314_row2_col1\" class=\"data row2 col1\" >0.064086</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_f5314_row3_col0\" class=\"data row3 col0\" >127</td>\n",
|
||||
" <td id=\"T_f5314_row3_col1\" class=\"data row3 col1\" >0.064086</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <td id=\"T_f5314_row4_col0\" class=\"data row4 col0\" >133</td>\n",
|
||||
" <td id=\"T_f5314_row4_col1\" class=\"data row4 col1\" >0.064086</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n"
|
||||
]
|
||||
},
|
||||
"metadata": {},
|
||||
"output_type": "display_data"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"from IPython.display import display_html\n",
|
||||
"dc_top = pd.DataFrame(sorted(d.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"degree_centrality\"])\n",
|
||||
"bc_top = pd.DataFrame(sorted(b.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"betweenness_centrality\"])\n",
|
||||
"katz_top = pd.DataFrame(sorted(k.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"katz_centrality\"])\n",
|
||||
"pr_top = pd.DataFrame(sorted(p.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"pagerank\"])\n",
|
||||
"ev_top = pd.DataFrame(sorted(e.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"eigenvector_centrality\"])\n",
|
||||
"\n",
|
||||
"df1_styler = dc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Degree').hide(axis='index')\n",
|
||||
"df2_styler = bc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Betweenness').hide(axis='index')\n",
|
||||
"df3_styler = katz_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Katz').hide(axis='index')\n",
|
||||
"df4_styler = pr_top.style.set_table_attributes(\"style='display:inline'\").set_caption('PageRank').hide(axis='index')\n",
|
||||
"df5_styler = ev_top.style.set_table_attributes(\"style='display:inline'\").set_caption('EigenVector').hide(axis='index')\n",
|
||||
"\n",
|
||||
"display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_()+df4_styler._repr_html_()+df5_styler._repr_html_(), raw=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "1a653ca9-9448-4ba5-85b2-f6c885c273a9",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Exercise #1 - Type Dispatch ###\n",
|
||||
"Use the type dispatching method to obtain pagerank centrality results with the cugraph backend."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "6eb90078-1479-4847-97b7-eb119e9d5478",
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Graph with 1406 nodes and 999 edges\n",
|
||||
"CudaGraph with 1406 nodes and 999 edges\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>vertex</th>\n",
|
||||
" <th>pagerank</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>24</td>\n",
|
||||
" <td>0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>72</td>\n",
|
||||
" <td>0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>86</td>\n",
|
||||
" <td>0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>127</td>\n",
|
||||
" <td>0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>133</td>\n",
|
||||
" <td>0.002525</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" vertex pagerank\n",
|
||||
"0 24 0.002525\n",
|
||||
"1 72 0.002525\n",
|
||||
"2 86 0.002525\n",
|
||||
"3 127 0.002525\n",
|
||||
"4 133 0.002525"
|
||||
]
|
||||
},
|
||||
"execution_count": 14,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import networkx as nx\n",
|
||||
"import nx_cugraph as nxcg\n",
|
||||
"\n",
|
||||
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')\n",
|
||||
"nxcg_G = nxcg.from_networkx(G)\n",
|
||||
"p = nx.pagerank(nxcg_G, max_iter=10, tol=1.0e-3)\n",
|
||||
"\n",
|
||||
"print(G)\n",
|
||||
"print(nxcg_G)\n",
|
||||
"\n",
|
||||
"pd.DataFrame(sorted(p.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"pagerank\"])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "d70c78b7-551d-4d9e-b428-32b26adcd3c4",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2279fdf1-82c0-4c6e-ac8e-b952f4777562",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Well Done!** "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "3fbc12b2-585c-48a9-a176-b2572040d378",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
426
5/data science/2/3-02_k-means.ipynb
Normal file
@ -0,0 +1,426 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fundamentals of Accelerated Data Science # "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 02 - K-Means ##\n",
|
||||
"\n",
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook uses GPU-accelerated K-means to find the best locations for a fixed number of humanitarian supply airdrop depots. This notebook covers the below sections: \n",
|
||||
"1. [Environment](#Environment)\n",
|
||||
"2. [Load Data](#Load-Data)\n",
|
||||
"3. [K-Means Clustering](#K-Means-Clustering)\n",
|
||||
" * [Exercise #1 - Make Another `KMeans` Instance](#Exercise-#1---Make-Another-KMeans-Instance)\n",
|
||||
"4. [Visualize the Clusters](#Visualize-the-Clusters)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment ##\n",
|
||||
"For the first time we import `cuml`, the RAPIDS GPU-accelerated library containing many common machine learning algorithms. We will be visualizing the results of your work in this notebook, so we also import `cuxfilter`."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"import cudf\n",
|
||||
"import cuml\n",
|
||||
"\n",
|
||||
"import cuxfilter as cxf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data ##\n",
|
||||
"For this notebook we load again the cleaned UK population data--in this case, we are not specifically looking at counties, so we omit that column and just keep the grid coordinate columns."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"northing float64\n",
|
||||
"easting float64\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(58479894, 2)"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"gdf = cudf.read_csv('./data/clean_uk_pop.csv', usecols=['easting', 'northing'])\n",
|
||||
"print(gdf.dtypes)\n",
|
||||
"gdf.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>northing</th>\n",
|
||||
" <th>easting</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>515491.5313</td>\n",
|
||||
" <td>430772.1875</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>503572.4688</td>\n",
|
||||
" <td>434685.8750</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>517903.6563</td>\n",
|
||||
" <td>432565.5313</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>517059.9063</td>\n",
|
||||
" <td>427660.6250</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>509228.6875</td>\n",
|
||||
" <td>425527.7813</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" northing easting\n",
|
||||
"0 515491.5313 430772.1875\n",
|
||||
"1 503572.4688 434685.8750\n",
|
||||
"2 517903.6563 432565.5313\n",
|
||||
"3 517059.9063 427660.6250\n",
|
||||
"4 509228.6875 425527.7813"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gdf.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s2-3'></a>\n",
|
||||
"## K-Means Clustering ##\n",
|
||||
"The unsupervised K-means clustering algorithm will look for a fixed number *k* of centroids in the data and clusters each point with its closest centroid. K-means can be effective when the number of clusters *k* is known or has a good estimate (such as from a model of the underlying mechanics of a problem).\n",
|
||||
"\n",
|
||||
"Assume that in addition to knowing the distribution of the population, which we do, we would like to estimate the best locations to build a fixed number of humanitarian supply depots from which we can perform airdrops and reach the population most efficiently. We can use K-means, setting *k* to the number of supply depots available and fitting on the locations of the population, to identify candidate locations.\n",
|
||||
"\n",
|
||||
"GPU-accelerated K-means is just as easy as its CPU-only scikit-learn counterpart. In this series of exercises, you will use it to optimize the locations for 5 supply depots.\n",
|
||||
"\n",
|
||||
"`cuml.KMeans()` will initialize a K-means instance. Use it now to initialize a K-means instance called `km`, passing the named argument `n_clusters` set equal to our desired number `5`. Use the `km.fit` method to fit `km` to the population's locations by passing it the population data. After fitting, add the cluster labels back to the `gdf` in a new column named `cluster`. Finally, you can use `km.cluster_centers_` to see where the algorithm created the 5 centroids.\n",
|
||||
"\n",
|
||||
"Below we train a K-means clustering algorithm to find 5 clusters. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>0</th>\n",
|
||||
" <th>1</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>306647.898235</td>\n",
|
||||
" <td>408370.452191</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>442109.465392</td>\n",
|
||||
" <td>402673.747673</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>288997.149971</td>\n",
|
||||
" <td>553805.430444</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>148770.463641</td>\n",
|
||||
" <td>311786.805381</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>170553.110214</td>\n",
|
||||
" <td>521605.459724</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" 0 1\n",
|
||||
"0 306647.898235 408370.452191\n",
|
||||
"1 442109.465392 402673.747673\n",
|
||||
"2 288997.149971 553805.430444\n",
|
||||
"3 148770.463641 311786.805381\n",
|
||||
"4 170553.110214 521605.459724"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# instantiate\n",
|
||||
"km = cuml.KMeans(n_clusters=5)\n",
|
||||
"\n",
|
||||
"# fit\n",
|
||||
"km.fit(gdf)\n",
|
||||
"\n",
|
||||
"# assign cluster as new column\n",
|
||||
"gdf['cluster'] = km.labels_\n",
|
||||
"km.cluster_centers_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s2-e1'></a>\n",
|
||||
"## Exercise #1 - Make Another `KMeans` Instance ##\n",
|
||||
"\n",
|
||||
"**Instructions**: <br>\n",
|
||||
"* Modify the `<FIXME>` only and execute the below cell to instantiate a K-means instance with 6 clusters.\n",
|
||||
"* Modify the `<FIXME>` only and execute the cell below to fit the data. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"km = cuml.KMeans(n_clusters=6)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"km.fit(gdf)\n",
|
||||
"gdf['cluster'] = km.labels_\n",
|
||||
"km.cluster_centers_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": true
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"km = cuml.KMeans(n_clusters=6)\n",
|
||||
"\n",
|
||||
"km.fit(gdf)\n",
|
||||
"gdf['cluster'] = km.labels_\n",
|
||||
"km.cluster_centers_"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Click ... for solution. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a id='s2-4'></a>\n",
|
||||
"## Visualize the Clusters ##\n",
|
||||
"To help us understand where clusters are located, we make a visualization that separates them, using the same three steps as before.\n",
|
||||
"\n",
|
||||
"Below we plot the clusters with cuxfilter. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# DO NOT CHANGE THIS CELL\n",
|
||||
"# associate a data source with cuXfilter\n",
|
||||
"cxf_data = cxf.DataFrame.from_dataframe(gdf)\n",
|
||||
"\n",
|
||||
"# define charts\n",
|
||||
"scatter_chart = cxf.charts.datashader.scatter(x='easting', y='northing')\n",
|
||||
"\n",
|
||||
"# define widget using the `cluster` column for multiselect\n",
|
||||
"# use the same technique to scale the scatterplot, then add a widget to let us select which cluster to look at\n",
|
||||
"cluster_widget = cxf.charts.panel_widgets.multi_select('cluster')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create dashboard\n",
|
||||
"dash = cxf_data.dashboard(charts=[scatter_chart],sidebar=[cluster_widget], theme=cxf.themes.dark, data_size_widget=True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dash.app()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Well Done!** Let's move to the [next notebook](3-03_dbscan.ipynb). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
373
5/data science/2/3-03_dbscan.ipynb
Normal file
@ -0,0 +1,373 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fundamentals of Accelerated Data Science # "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 03 - DBSCAN ##\n",
|
||||
"\n",
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook uses GPU-accelerated DBSCAN to identify clusters of infected people. This notebook covers the below sections: \n",
|
||||
"1. [Environment](#Environment)\n",
|
||||
"2. [Load Data](#Load-Data)\n",
|
||||
"3. [DBSCAN Clustering](#DBSCAN-Clustering)\n",
|
||||
" * [Exercise #1 - Make Another DBSCAN Instance](#Exercise-#1---Make-Another-DBSCAN-Instance)\n",
|
||||
"4. [Visualize the Clusters](#Visualize-the-Clusters)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment ##"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cudf\n",
|
||||
"import cuml\n",
|
||||
"\n",
|
||||
"import cuxfilter as cxf"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data ##\n",
|
||||
"For this notebook, we again load a subset of our population data with only the columns we need. An `infected` column has been added to the data to indicate whether or not a person is known to be infected with our simulated virus."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"northing float32\n",
|
||||
"easting float32\n",
|
||||
"infected float32\n",
|
||||
"dtype: object\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"(1000000, 3)"
|
||||
]
|
||||
},
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gdf = cudf.read_csv('./data/pop_sample.csv', dtype=['float32', 'float32', 'float32'])\n",
|
||||
"print(gdf.dtypes)\n",
|
||||
"gdf.shape"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>northing</th>\n",
|
||||
" <th>easting</th>\n",
|
||||
" <th>infected</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>178547.296875</td>\n",
|
||||
" <td>368012.1250</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>174068.281250</td>\n",
|
||||
" <td>543802.1250</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>358293.687500</td>\n",
|
||||
" <td>435639.8750</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>87240.304688</td>\n",
|
||||
" <td>389607.3750</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>158261.015625</td>\n",
|
||||
" <td>340764.9375</td>\n",
|
||||
" <td>0.0</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" northing easting infected\n",
|
||||
"0 178547.296875 368012.1250 0.0\n",
|
||||
"1 174068.281250 543802.1250 0.0\n",
|
||||
"2 358293.687500 435639.8750 0.0\n",
|
||||
"3 87240.304688 389607.3750 0.0\n",
|
||||
"4 158261.015625 340764.9375 0.0"
|
||||
]
|
||||
},
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gdf.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"infected\n",
|
||||
"0.0 984331\n",
|
||||
"1.0 15669\n",
|
||||
"Name: count, dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gdf['infected'].value_counts()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## DBSCAN Clustering ##\n",
|
||||
"DBSCAN is another unsupervised clustering algorithm that is particularly effective when the number of clusters is not known up front and the clusters may have concave or other unusual shapes--a situation that often applies in geospatial analytics.\n",
|
||||
"\n",
|
||||
"In this series of exercises you will use DBSCAN to identify clusters of infected people by location, which may help us identify groups becoming infected from common patient zeroes and assist in response planning.\n",
|
||||
"\n",
|
||||
"Create a DBSCAN instance by using `cuml.DBSCAN`. Pass in the named argument `eps` (the maximum distance a point can be from the nearest point in a cluster to be considered possibly in that cluster) to be `5000`. Since the `northing` and `easting` values we created are measured in meters, this will allow us to identify clusters of infected people where individuals may be separated from the rest of the cluster by up to 5 kilometers.\n",
|
||||
"\n",
|
||||
"Below we train a DBSCAN algorithm. We start by creating a new dataframe from rows of the original dataframe where `infected` is `1` (true), and call it `infected_df`--be sure to reset the dataframe's index afterward. Use `dbscan.fit_predict` to perform clustering on the `northing` and `easting` columns of `infected_df`, and turn the resulting series into a new column in `infected_df` called \"cluster\". Finally, compute the number of clusters identified by DBSCAN."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"96"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dbscan = cuml.DBSCAN(eps=5000)\n",
|
||||
"# dbscan = cuml.DBSCAN(eps=10000)\n",
|
||||
"\n",
|
||||
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
|
||||
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
|
||||
"infected_df['cluster'].nunique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Exercise #1 - Make Another DBSCAN Instance ###\n",
|
||||
"\n",
|
||||
"**Instructions**: <br>\n",
|
||||
"* Modify the `<FIXME>` only and execute the below cell to instantiate a DBSCAN instance with `10000` for `eps`.\n",
|
||||
"* Modify the `<FIXME>` only and execute the cell below to fit the data and identify infected clusters. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"dbscan = cuml.DBSCAN(eps=10000)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"10"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
|
||||
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
|
||||
"infected_df['cluster'].nunique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "raw",
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": true
|
||||
}
|
||||
},
|
||||
"source": [
|
||||
"\n",
|
||||
"dbscan = cuml.DBSCAN(eps=10000)\n",
|
||||
"\n",
|
||||
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
|
||||
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
|
||||
"infected_df['cluster'].nunique()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Click ... for solution. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Visualize the Clusters"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Because we have the same column names as in the K-means example--`easting`, `northing`, and `cluster`--we can use the same code to visualize the clusters."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"infected_df.to_pandas().plot(kind='scatter', x='easting', y='northing', c='cluster')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Well Done!** Let's move to the [next notebook](3-04_logistic_regression.ipynb). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
1463
5/data science/2/3-04_logistic_regression.ipynb
Normal file
1152
5/data science/2/3-05_knn.ipynb
Normal file
810
5/data science/2/3-06_xgboost.ipynb
Normal file
1050
5/data science/2/3-07_triton.ipynb
Normal file
673
5/data science/2/3-08_k-means_dask.ipynb
Normal file
@ -0,0 +1,673 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Fundamentals of Accelerated Data Science # "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 08 - Multi-GPU K-Means with Dask ##\n",
|
||||
"\n",
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook uses GPU-accelerated K-means to identify population clusters in a multi-node, multi-GPU scalable way with Dask. This notebook covers the below sections: \n",
|
||||
"1. [Environment](#Environment)\n",
|
||||
"2. [Load and Persist Data](#Load-and-Persist-Data)\n",
|
||||
"3. [Training the Model](#Training-the-Model)\n",
|
||||
" * [Exercise #1 - Count Members of the Southernmost Cluster](#Exercise-#1---Count-Members-of-the-Southernmost-Cluster)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Environment ##\n",
|
||||
"First we import the needed modules to create a Dask cuDF cluster. As we did before, we need to import CUDA context creators after setting up the cluster so they don't lock to a single device. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import subprocess\n",
|
||||
"import logging\n",
|
||||
"\n",
|
||||
"from dask.distributed import Client, wait, progress\n",
|
||||
"from dask_cuda import LocalCUDACluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import cudf\n",
|
||||
"import dask_cudf\n",
|
||||
"\n",
|
||||
"import cuml\n",
|
||||
"from cuml.dask.cluster import KMeans"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# create cluster\n",
|
||||
"cmd = \"hostname --all-ip-addresses\"\n",
|
||||
"process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n",
|
||||
"output, error = process.communicate()\n",
|
||||
"IPADDR = str(output.decode()).split()[0]\n",
|
||||
"\n",
|
||||
"cluster = LocalCUDACluster(ip=IPADDR, silence_logs=logging.ERROR)\n",
|
||||
"client = Client(cluster)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load and Persist Data ##\n",
|
||||
"We will begin by loading the data. The data set has the two grid coordinate columns, `easting` and `northing`, derived from the main population data set we have prepared."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"ddf = dask_cudf.read_csv('./data/uk_pop5x_coords.csv', dtype=['float32', 'float32'])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Training the Model ##\n",
|
||||
"Training the K-means model is very similar to both the scikit-learn version and the cuML single-GPU version--by setting up the client and importing from the `cuml.dask.cluster` module, the algorithm will automatically use the local Dask cluster we have set up.\n",
|
||||
"\n",
|
||||
"Note that calling `.fit` triggers Dask computation.\n",
|
||||
"\n",
|
||||
"Once we have the fit model, we extract the cluster centers and rename the columns from their generic `0` and `1` to reflect the data on which they were trained."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"CPU times: user 5.24 s, sys: 2.48 s, total: 7.72 s\n",
|
||||
"Wall time: 1min 54s\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<style>#sk-container-id-1 {\n",
|
||||
" /* Definition of color scheme common for light and dark mode */\n",
|
||||
" --sklearn-color-text: black;\n",
|
||||
" --sklearn-color-line: gray;\n",
|
||||
" /* Definition of color scheme for unfitted estimators */\n",
|
||||
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
|
||||
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
|
||||
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
|
||||
" --sklearn-color-unfitted-level-3: chocolate;\n",
|
||||
" /* Definition of color scheme for fitted estimators */\n",
|
||||
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
|
||||
" --sklearn-color-fitted-level-1: #d4ebff;\n",
|
||||
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
|
||||
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
|
||||
"\n",
|
||||
" /* Specific color for light theme */\n",
|
||||
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
||||
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
|
||||
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
|
||||
" --sklearn-color-icon: #696969;\n",
|
||||
"\n",
|
||||
" @media (prefers-color-scheme: dark) {\n",
|
||||
" /* Redefinition of color scheme for dark theme */\n",
|
||||
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
||||
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
|
||||
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
|
||||
" --sklearn-color-icon: #878787;\n",
|
||||
" }\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 pre {\n",
|
||||
" padding: 0;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 input.sk-hidden--visually {\n",
|
||||
" border: 0;\n",
|
||||
" clip: rect(1px 1px 1px 1px);\n",
|
||||
" clip: rect(1px, 1px, 1px, 1px);\n",
|
||||
" height: 1px;\n",
|
||||
" margin: -1px;\n",
|
||||
" overflow: hidden;\n",
|
||||
" padding: 0;\n",
|
||||
" position: absolute;\n",
|
||||
" width: 1px;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
|
||||
" border: 1px dashed var(--sklearn-color-line);\n",
|
||||
" margin: 0 0.4em 0.5em 0.4em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" padding-bottom: 0.4em;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-container {\n",
|
||||
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
|
||||
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
|
||||
" so we also need the `!important` here to be able to override the\n",
|
||||
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
|
||||
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
|
||||
" display: inline-block !important;\n",
|
||||
" position: relative;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
|
||||
" display: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"div.sk-parallel-item,\n",
|
||||
"div.sk-serial,\n",
|
||||
"div.sk-item {\n",
|
||||
" /* draw centered vertical line to link estimators */\n",
|
||||
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
|
||||
" background-size: 2px 100%;\n",
|
||||
" background-repeat: no-repeat;\n",
|
||||
" background-position: center center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Parallel-specific style estimator block */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel-item::after {\n",
|
||||
" content: \"\";\n",
|
||||
" width: 100%;\n",
|
||||
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
|
||||
" flex-grow: 1;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel {\n",
|
||||
" display: flex;\n",
|
||||
" align-items: stretch;\n",
|
||||
" justify-content: center;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" position: relative;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel-item {\n",
|
||||
" display: flex;\n",
|
||||
" flex-direction: column;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
|
||||
" align-self: flex-end;\n",
|
||||
" width: 50%;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
|
||||
" align-self: flex-start;\n",
|
||||
" width: 50%;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
|
||||
" width: 0;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Serial-specific style estimator block */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-serial {\n",
|
||||
" display: flex;\n",
|
||||
" flex-direction: column;\n",
|
||||
" align-items: center;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" padding-right: 1em;\n",
|
||||
" padding-left: 1em;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"\n",
|
||||
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
|
||||
"clickable and can be expanded/collapsed.\n",
|
||||
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
|
||||
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
|
||||
"*/\n",
|
||||
"\n",
|
||||
"/* Pipeline and ColumnTransformer style (default) */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-toggleable {\n",
|
||||
" /* Default theme specific background. It is overwritten whether we have a\n",
|
||||
" specific estimator or a Pipeline/ColumnTransformer */\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Toggleable label */\n",
|
||||
"#sk-container-id-1 label.sk-toggleable__label {\n",
|
||||
" cursor: pointer;\n",
|
||||
" display: block;\n",
|
||||
" width: 100%;\n",
|
||||
" margin-bottom: 0;\n",
|
||||
" padding: 0.5em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" text-align: center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
|
||||
" /* Arrow on the left of the label */\n",
|
||||
" content: \"▸\";\n",
|
||||
" float: left;\n",
|
||||
" margin-right: 0.25em;\n",
|
||||
" color: var(--sklearn-color-icon);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Toggleable content - dropdown */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-toggleable__content {\n",
|
||||
" max-height: 0;\n",
|
||||
" max-width: 0;\n",
|
||||
" overflow: hidden;\n",
|
||||
" text-align: left;\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
|
||||
" margin: 0.2em;\n",
|
||||
" border-radius: 0.25em;\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
|
||||
" /* Expand drop-down */\n",
|
||||
" max-height: 200px;\n",
|
||||
" max-width: 100%;\n",
|
||||
" overflow: auto;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
|
||||
" content: \"▾\";\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Pipeline/ColumnTransformer-specific style */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator-specific style */\n",
|
||||
"\n",
|
||||
"/* Colorize estimator box */\n",
|
||||
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
|
||||
"#sk-container-id-1 div.sk-label label {\n",
|
||||
" /* The background is the default theme color */\n",
|
||||
" color: var(--sklearn-color-text-on-default-background);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover, darken the color of the background */\n",
|
||||
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Label box, darken color on hover, fitted */\n",
|
||||
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator label */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-label label {\n",
|
||||
" font-family: monospace;\n",
|
||||
" font-weight: bold;\n",
|
||||
" display: inline-block;\n",
|
||||
" line-height: 1.2em;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-label-container {\n",
|
||||
" text-align: center;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Estimator-specific */\n",
|
||||
"#sk-container-id-1 div.sk-estimator {\n",
|
||||
" font-family: monospace;\n",
|
||||
" border: 1px dotted var(--sklearn-color-border-box);\n",
|
||||
" border-radius: 0.25em;\n",
|
||||
" box-sizing: border-box;\n",
|
||||
" margin-bottom: 0.5em;\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-estimator.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-0);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* on hover */\n",
|
||||
"#sk-container-id-1 div.sk-estimator:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-2);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
|
||||
"\n",
|
||||
"/* Common style for \"i\" and \"?\" */\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link,\n",
|
||||
"a:link.sk-estimator-doc-link,\n",
|
||||
"a:visited.sk-estimator-doc-link {\n",
|
||||
" float: right;\n",
|
||||
" font-size: smaller;\n",
|
||||
" line-height: 1em;\n",
|
||||
" font-family: monospace;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" border-radius: 1em;\n",
|
||||
" height: 1em;\n",
|
||||
" width: 1em;\n",
|
||||
" text-decoration: none !important;\n",
|
||||
" margin-left: 1ex;\n",
|
||||
" /* unfitted */\n",
|
||||
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-unfitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link.fitted,\n",
|
||||
"a:link.sk-estimator-doc-link.fitted,\n",
|
||||
"a:visited.sk-estimator-doc-link.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-fitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover */\n",
|
||||
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
|
||||
".sk-estimator-doc-link:hover,\n",
|
||||
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
|
||||
".sk-estimator-doc-link:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
|
||||
".sk-estimator-doc-link.fitted:hover,\n",
|
||||
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
|
||||
".sk-estimator-doc-link.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* Span, style for the box shown on hovering the info icon */\n",
|
||||
".sk-estimator-doc-link span {\n",
|
||||
" display: none;\n",
|
||||
" z-index: 9999;\n",
|
||||
" position: relative;\n",
|
||||
" font-weight: normal;\n",
|
||||
" right: .2ex;\n",
|
||||
" padding: .5ex;\n",
|
||||
" margin: .5ex;\n",
|
||||
" width: min-content;\n",
|
||||
" min-width: 20ex;\n",
|
||||
" max-width: 50ex;\n",
|
||||
" color: var(--sklearn-color-text);\n",
|
||||
" box-shadow: 2pt 2pt 4pt #999;\n",
|
||||
" /* unfitted */\n",
|
||||
" background: var(--sklearn-color-unfitted-level-0);\n",
|
||||
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link.fitted span {\n",
|
||||
" /* fitted */\n",
|
||||
" background: var(--sklearn-color-fitted-level-0);\n",
|
||||
" border: var(--sklearn-color-fitted-level-3);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
".sk-estimator-doc-link:hover span {\n",
|
||||
" display: block;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 a.estimator_doc_link {\n",
|
||||
" float: right;\n",
|
||||
" font-size: 1rem;\n",
|
||||
" line-height: 1em;\n",
|
||||
" font-family: monospace;\n",
|
||||
" background-color: var(--sklearn-color-background);\n",
|
||||
" border-radius: 1rem;\n",
|
||||
" height: 1rem;\n",
|
||||
" width: 1rem;\n",
|
||||
" text-decoration: none;\n",
|
||||
" /* unfitted */\n",
|
||||
" color: var(--sklearn-color-unfitted-level-1);\n",
|
||||
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
|
||||
" /* fitted */\n",
|
||||
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
|
||||
" color: var(--sklearn-color-fitted-level-1);\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"/* On hover */\n",
|
||||
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
|
||||
" /* unfitted */\n",
|
||||
" background-color: var(--sklearn-color-unfitted-level-3);\n",
|
||||
" color: var(--sklearn-color-background);\n",
|
||||
" text-decoration: none;\n",
|
||||
"}\n",
|
||||
"\n",
|
||||
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
|
||||
" /* fitted */\n",
|
||||
" background-color: var(--sklearn-color-fitted-level-3);\n",
|
||||
"}\n",
|
||||
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeansMG()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\"> KMeansMG<span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>KMeansMG()</pre></div> </div></div></div></div>"
|
||||
],
|
||||
"text/plain": [
|
||||
"KMeansMG()"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"dkm = KMeans(n_clusters=20)\n",
|
||||
"dkm.fit(ddf)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"northing float32\n",
|
||||
"easting float32\n",
|
||||
"dtype: object"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cluster_centers = dkm.cluster_centers_\n",
|
||||
"cluster_centers.columns = ddf.columns\n",
|
||||
"cluster_centers.dtypes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"### Exercise #1 - Count Members of the Southernmost Cluster ###\n",
|
||||
"Using the `cluster_centers`, identify which cluster is the southernmost (has the lowest `northing` value) with the `nsmallest` method, then use `dkm.predict` to get labels for the data, and finally filter the labels to determine how many individuals the model estimated were in that cluster. \n",
|
||||
"\n",
|
||||
"**Instructions**: <br>\n",
|
||||
"* Modify the `<FIXME>` only and execute the below cell to estimate the number of individuals in the southernmost cluster. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"31435157"
|
||||
]
|
||||
},
|
||||
"execution_count": 7,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"south_idx = cluster_centers.nsmallest(1, 'northing').index[0]\n",
|
||||
"labels_predicted = dkm.predict(ddf)\n",
|
||||
"labels_predicted[labels_predicted==south_idx].compute().shape[0]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'status': 'ok', 'restart': True}"
|
||||
]
|
||||
},
|
||||
"execution_count": 8,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"154087362\n",
|
||||
"144014032\n",
|
||||
"131789736\n",
|
||||
"154907810\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Well Done!**"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<img src=\"./images/DLI_Header.png\" width=400/>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
818
5/data science/2/4-02_find_infected.ipynb
Normal file
@ -0,0 +1,818 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a href=\"https://www.nvidia.com/dli\"><img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/></a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"# Week 1: Find Clusters of Infected People"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<span style=\"color:red\">\n",
|
||||
"**URGENT WARNING**\n",
|
||||
"\n",
|
||||
"We have been receiving reports from health facilities that a new, fast-spreading virus has been discovered in the population. To prepare our response, we need to understand the geospatial distribution of those who have been infected. Find out whether there are identifiable clusters of infected individuals and where they are. \n",
|
||||
"</span>\n",
|
||||
"\n",
|
||||
"Your goal for this notebook will be to estimate the location of dense geographic clusters of infected people using incoming data from week 1 of the simulated epidemic."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Imports"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 65,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"The cudf.pandas extension is already loaded. To reload it, use:\n",
|
||||
" %reload_ext cudf.pandas\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%load_ext cudf.pandas\n",
|
||||
"import pandas as pd\n",
|
||||
"import cuml\n",
|
||||
"\n",
|
||||
"import cupy as cp"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Load Data"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Begin by loading the data you've received about week 1 of the outbreak into a cuDF-accelerated pandas DataFrame. The data is located at `'./data/week1.csv'`. For this notebook you will only need the `'lat'`, `'long'`, and `'infected'` columns. Either drop the columns after loading, or use the `pd.read_csv` named argument `usecols` to provide a list of only the columns you need."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 66,
|
||||
"metadata": {
|
||||
"scrolled": true
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>lat</th>\n",
|
||||
" <th>long</th>\n",
|
||||
" <th>infected</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>54.522511</td>\n",
|
||||
" <td>-1.571896</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>54.554031</td>\n",
|
||||
" <td>-1.524968</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>54.552483</td>\n",
|
||||
" <td>-1.435203</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>54.537186</td>\n",
|
||||
" <td>-1.566215</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>54.528210</td>\n",
|
||||
" <td>-1.588462</td>\n",
|
||||
" <td>False</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" lat long infected\n",
|
||||
"0 54.522511 -1.571896 False\n",
|
||||
"1 54.554031 -1.524968 False\n",
|
||||
"2 54.552483 -1.435203 False\n",
|
||||
"3 54.537186 -1.566215 False\n",
|
||||
"4 54.528210 -1.588462 False"
|
||||
]
|
||||
},
|
||||
"execution_count": 66,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"df = pd.read_csv('./data/week1.csv', dtype = {\n",
|
||||
" 'lat': 'float32',\n",
|
||||
" 'long': 'float32',\n",
|
||||
" 'infected': 'category',\n",
|
||||
"}, usecols = ['lat', 'long', 'infected'])\n",
|
||||
"df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make Data Frame of the Infected"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Make a new DataFrame `infected_df` that contains only the infected members of the population.\n",
|
||||
"\n",
|
||||
"**Tip**: Reset the index of `infected_df` with `.reset_index(drop=True)`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"[28928759 28930512 28930904 ... 57410428 57411005 57411919]\n",
|
||||
"[ 0 1 2 ... 18145 18146 18147]\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>lat</th>\n",
|
||||
" <th>long</th>\n",
|
||||
" <th>infected</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>54.472767</td>\n",
|
||||
" <td>-1.654932</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>54.529720</td>\n",
|
||||
" <td>-1.667143</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>54.512981</td>\n",
|
||||
" <td>-1.589866</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>54.522320</td>\n",
|
||||
" <td>-1.380694</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>54.541656</td>\n",
|
||||
" <td>-1.613490</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" lat long infected\n",
|
||||
"0 54.472767 -1.654932 True\n",
|
||||
"1 54.529720 -1.667143 True\n",
|
||||
"2 54.512981 -1.589866 True\n",
|
||||
"3 54.522320 -1.380694 True\n",
|
||||
"4 54.541656 -1.613490 True"
|
||||
]
|
||||
},
|
||||
"execution_count": 13,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"infected_df = df[df['infected'] == True]\n",
|
||||
"print(infected_df.index.values)\n",
|
||||
"\n",
|
||||
"infected_df = infected_df.reset_index(drop=True)\n",
|
||||
"\n",
|
||||
"print(infected_df.index.values)\n",
|
||||
"infected_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Make Grid Coordinates for Infected Locations"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Provided for you in the next cell (which you can expand by clicking on the \"...\" and contract again after executing by clicking on the blue left border of the cell) is the lat/long to OSGB36 grid coordinates converter you used earlier in the workshop. Use this converter to create grid coordinate values stored in `northing` and `easting` columns of the `infected_df` you created in the last step."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 16,
|
||||
"metadata": {
|
||||
"jupyter": {
|
||||
"source_hidden": true
|
||||
}
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# https://www.ordnancesurvey.co.uk/docs/support/guide-coordinate-systems-great-britain.pdf\n",
|
||||
"\n",
|
||||
"def latlong2osgbgrid_cupy(lat, long, input_degrees=True):\n",
|
||||
" '''\n",
|
||||
" Converts latitude and longitude (ellipsoidal) coordinates into northing and easting (grid) coordinates, using a Transverse Mercator projection.\n",
|
||||
" \n",
|
||||
" Inputs:\n",
|
||||
" lat: latitude coordinate (N)\n",
|
||||
" long: longitude coordinate (E)\n",
|
||||
" input_degrees: if True (default), interprets the coordinates as degrees; otherwise, interprets coordinates as radians\n",
|
||||
" \n",
|
||||
" Output:\n",
|
||||
" (northing, easting)\n",
|
||||
" '''\n",
|
||||
" \n",
|
||||
" if input_degrees:\n",
|
||||
" lat = lat * cp.pi/180\n",
|
||||
" long = long * cp.pi/180\n",
|
||||
"\n",
|
||||
" a = 6377563.396\n",
|
||||
" b = 6356256.909\n",
|
||||
" e2 = (a**2 - b**2) / a**2\n",
|
||||
"\n",
|
||||
" N0 = -100000 # northing of true origin\n",
|
||||
" E0 = 400000 # easting of true origin\n",
|
||||
" F0 = .9996012717 # scale factor on central meridian\n",
|
||||
" phi0 = 49 * cp.pi / 180 # latitude of true origin\n",
|
||||
" lambda0 = -2 * cp.pi / 180 # longitude of true origin and central meridian\n",
|
||||
" \n",
|
||||
" sinlat = cp.sin(lat)\n",
|
||||
" coslat = cp.cos(lat)\n",
|
||||
" tanlat = cp.tan(lat)\n",
|
||||
" \n",
|
||||
" latdiff = lat-phi0\n",
|
||||
" longdiff = long-lambda0\n",
|
||||
"\n",
|
||||
" n = (a-b) / (a+b)\n",
|
||||
" nu = a * F0 * (1 - e2 * sinlat ** 2) ** -.5\n",
|
||||
" rho = a * F0 * (1 - e2) * (1 - e2 * sinlat ** 2) ** -1.5\n",
|
||||
" eta2 = nu / rho - 1\n",
|
||||
" M = b * F0 * ((1 + n + 5/4 * (n**2 + n**3)) * latdiff - \n",
|
||||
" (3*(n+n**2) + 21/8 * n**3) * cp.sin(latdiff) * cp.cos(lat+phi0) +\n",
|
||||
" 15/8 * (n**2 + n**3) * cp.sin(2*(latdiff)) * cp.cos(2*(lat+phi0)) - \n",
|
||||
" 35/24 * n**3 * cp.sin(3*(latdiff)) * cp.cos(3*(lat+phi0)))\n",
|
||||
" I = M + N0\n",
|
||||
" II = nu/2 * sinlat * coslat\n",
|
||||
" III = nu/24 * sinlat * coslat ** 3 * (5 - tanlat ** 2 + 9 * eta2)\n",
|
||||
" IIIA = nu/720 * sinlat * coslat ** 5 * (61-58 * tanlat**2 + tanlat**4)\n",
|
||||
" IV = nu * coslat\n",
|
||||
" V = nu / 6 * coslat**3 * (nu/rho - cp.tan(lat)**2)\n",
|
||||
" VI = nu / 120 * coslat ** 5 * (5 - 18 * tanlat**2 + tanlat**4 + 14 * eta2 - 58 * tanlat**2 * eta2)\n",
|
||||
"\n",
|
||||
" northing = I + II * longdiff**2 + III * longdiff**4 + IIIA * longdiff**6\n",
|
||||
" easting = E0 + IV * longdiff + V * longdiff**3 + VI * longdiff**5\n",
|
||||
"\n",
|
||||
" return(northing, easting)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>lat</th>\n",
|
||||
" <th>long</th>\n",
|
||||
" <th>infected</th>\n",
|
||||
" <th>northing</th>\n",
|
||||
" <th>easting</th>\n",
|
||||
" <th>cluster</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>54.472767</td>\n",
|
||||
" <td>-1.654932</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>508670.609809</td>\n",
|
||||
" <td>422359.747233</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>54.529720</td>\n",
|
||||
" <td>-1.667143</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>515003.452959</td>\n",
|
||||
" <td>421538.534748</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>54.512981</td>\n",
|
||||
" <td>-1.589866</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>513167.311551</td>\n",
|
||||
" <td>426549.871569</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>54.522320</td>\n",
|
||||
" <td>-1.380694</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>514305.528712</td>\n",
|
||||
" <td>440081.234190</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>54.541656</td>\n",
|
||||
" <td>-1.613490</td>\n",
|
||||
" <td>True</td>\n",
|
||||
" <td>516349.193146</td>\n",
|
||||
" <td>425002.998690</td>\n",
|
||||
" <td>-1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" lat long infected northing easting cluster\n",
|
||||
"0 54.472767 -1.654932 True 508670.609809 422359.747233 -1\n",
|
||||
"1 54.529720 -1.667143 True 515003.452959 421538.534748 -1\n",
|
||||
"2 54.512981 -1.589866 True 513167.311551 426549.871569 -1\n",
|
||||
"3 54.522320 -1.380694 True 514305.528712 440081.234190 -1\n",
|
||||
"4 54.541656 -1.613490 True 516349.193146 425002.998690 -1"
|
||||
]
|
||||
},
|
||||
"execution_count": 69,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"cupy_lat = cp.asarray(infected_df['lat'])\n",
|
||||
"cupy_long = cp.asarray(infected_df['long'])\n",
|
||||
"\n",
|
||||
"infected_df['northing'], infected_df['easting'] = latlong2osgbgrid_cupy(cupy_lat, cupy_long)\n",
|
||||
"infected_df.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Find Clusters of Infected People"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use DBSCAN to find clusters of at least 25 infected people where no member is more than 2000m from at least one other cluster member. Create a new column in `infected_df` which contains the cluster to which each infected person belongs."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"<cudf.core.groupby.groupby.DataFrameGroupBy object at 0x7f55ea949240>"
|
||||
]
|
||||
},
|
||||
"execution_count": 70,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"dbscan = cuml.DBSCAN(eps = 2000, min_samples = 25)\n",
|
||||
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
|
||||
"infected_df.groupby('cluster')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Find the Centroid of Each Cluster"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use grouping to find the mean `northing` and `easting` values for each cluster identified above."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 71,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>northing</th>\n",
|
||||
" <th>easting</th>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>cluster</th>\n",
|
||||
" <th></th>\n",
|
||||
" <th></th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>-1</th>\n",
|
||||
" <td>378094.622647</td>\n",
|
||||
" <td>401880.682473</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>397661.319575</td>\n",
|
||||
" <td>371410.021738</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>436475.527827</td>\n",
|
||||
" <td>332980.449214</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>347062.477357</td>\n",
|
||||
" <td>389386.823243</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>359668.552556</td>\n",
|
||||
" <td>379638.020362</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>391630.403390</td>\n",
|
||||
" <td>431158.137254</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>5</th>\n",
|
||||
" <td>386471.397432</td>\n",
|
||||
" <td>426559.085587</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>6</th>\n",
|
||||
" <td>434970.462486</td>\n",
|
||||
" <td>406985.278520</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>7</th>\n",
|
||||
" <td>412772.652344</td>\n",
|
||||
" <td>410069.663793</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>8</th>\n",
|
||||
" <td>415808.971615</td>\n",
|
||||
" <td>414713.750256</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>9</th>\n",
|
||||
" <td>417322.530166</td>\n",
|
||||
" <td>409583.737652</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>10</th>\n",
|
||||
" <td>334208.471668</td>\n",
|
||||
" <td>435937.777721</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>11</th>\n",
|
||||
" <td>300568.023792</td>\n",
|
||||
" <td>391901.514790</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>12</th>\n",
|
||||
" <td>291539.540205</td>\n",
|
||||
" <td>401640.663845</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>13</th>\n",
|
||||
" <td>289855.069902</td>\n",
|
||||
" <td>394518.295606</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" northing easting\n",
|
||||
"cluster \n",
|
||||
"-1 378094.622647 401880.682473\n",
|
||||
" 0 397661.319575 371410.021738\n",
|
||||
" 1 436475.527827 332980.449214\n",
|
||||
" 2 347062.477357 389386.823243\n",
|
||||
" 3 359668.552556 379638.020362\n",
|
||||
" 4 391630.403390 431158.137254\n",
|
||||
" 5 386471.397432 426559.085587\n",
|
||||
" 6 434970.462486 406985.278520\n",
|
||||
" 7 412772.652344 410069.663793\n",
|
||||
" 8 415808.971615 414713.750256\n",
|
||||
" 9 417322.530166 409583.737652\n",
|
||||
" 10 334208.471668 435937.777721\n",
|
||||
" 11 300568.023792 391901.514790\n",
|
||||
" 12 291539.540205 401640.663845\n",
|
||||
" 13 289855.069902 394518.295606"
|
||||
]
|
||||
},
|
||||
"execution_count": 71,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"centroids_df = infected_df[['northing', 'easting', 'cluster']].groupby('cluster').mean()\n",
|
||||
"centroids_df"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Find the number of people in each cluster by counting the number of appearances of each cluster's label in the column produced by DBSCAN."
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 72,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"cluster\n",
|
||||
"-1 8451\n",
|
||||
" 0 8638\n",
|
||||
" 1 68\n",
|
||||
" 2 403\n",
|
||||
" 3 25\n",
|
||||
" 4 66\n",
|
||||
" 5 43\n",
|
||||
" 6 27\n",
|
||||
" 7 39\n",
|
||||
" 8 92\n",
|
||||
" 9 21\n",
|
||||
" 10 64\n",
|
||||
" 11 68\n",
|
||||
" 12 72\n",
|
||||
" 13 71\n",
|
||||
"dtype: int64"
|
||||
]
|
||||
},
|
||||
"execution_count": 72,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"infected_df.groupby(['cluster']).size()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Find the Centroid of the Cluster with the Most Members ##"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Use the cluster label with the most people to filter `centroids_df` and write the answer to `my_assessment/question_1.json`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 78,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"/opt/conda/lib/python3.10/site-packages/cudf/io/json.py:194: UserWarning: Using CPU via Pandas to write JSON dataset\n",
|
||||
" warnings.warn(\"Using CPU via Pandas to write JSON dataset\")\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"centroids_df.loc[0].to_json('my_assessment/question_1.json')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## Check Submission ##"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 79,
|
||||
"metadata": {},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"{\"northing\":397661.3195752321,\"easting\":371410.0217381102}"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"!cat my_assessment/question_1.json"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"**Tip**: Your submission file should contain one line of text, similar to: \n",
|
||||
"\n",
|
||||
"```\n",
|
||||
"{'northing':XXX.XX,'easting':XXX.XX}\n",
|
||||
"```"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<div align=\"center\"><h2>Please Restart the Kernel</h2></div>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"import IPython\n",
|
||||
"app = IPython.Application.instance()\n",
|
||||
"app.kernel.do_shutdown(True)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a href=\"https://www.nvidia.com/dli\"><img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/></a>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.10.15"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 4
|
||||
}
|
||||
3718
5/data science/2/4-03_nearest_facilities.ipynb
Normal file
1212
5/data science/2/4-04_identify_risk_factors.ipynb
Normal file
172
5/data science/2/county_centroid.csv
Normal file
@ -0,0 +1,172 @@
|
||||
county,lat_county_center,long_county_center
|
||||
BARKING AND DAGENHAM,51.621048311776526,0.12958319845588165
|
||||
BARNET,51.81255163972051,-0.21821206632197684
|
||||
BARNSLEY,53.57190690010971,-1.5487193565226611
|
||||
BATH AND NORTH EAST SOMERSET,51.35496548780361,-2.486675162410336
|
||||
BEDFORD,52.145475839485385,-0.4549734374180617
|
||||
BEXLEY,51.33625605642689,0.14633321710015448
|
||||
BIRMINGHAM,52.12178304394528,-1.881329432771379
|
||||
BLACKBURN WITH DARWEN,53.63718763008419,-2.463700844959783
|
||||
BLACKPOOL,53.882118373353435,-3.0229009637127167
|
||||
BLAENAU GWENT,51.75159582861159,-3.1862426125686745
|
||||
BOLTON,53.73813128127497,-2.4794091133678147
|
||||
BRACKNELL FOREST,51.457925145468295,-0.7336441271286038
|
||||
BRADFORD,53.972113267048044,-1.8738762931122748
|
||||
BRENT,51.761695309784,-0.2756927203781798
|
||||
BRIDGEND,51.522888539164526,-3.6137468421270604
|
||||
BRIGHTON AND HOVE,50.94890407892698,-0.1507807253912774
|
||||
"BRISTOL, CITY OF",51.53203785026057,-2.5774864859032594
|
||||
BROMLEY,51.2251371203518,0.03905163114984023
|
||||
BUCKINGHAMSHIRE,51.92925587759856,-0.8053996183750294
|
||||
BURY,53.61553432785575,-2.3088650595977023
|
||||
CAERPHILLY,51.62781255006381,-3.1973649865483735
|
||||
CALDERDALE,53.769761331289686,-1.9616103771384508
|
||||
CAMBRIDGESHIRE,52.1333820427886,-0.23503728806014595
|
||||
CAMDEN,51.69346289078886,-0.1629412552292679
|
||||
CARDIFF,51.56635588939404,-3.222317281083218
|
||||
CARMARTHENSHIRE,51.92106862577838,-4.211293704149962
|
||||
CENTRAL BEDFORDSHIRE,51.99983427713095,-0.4775810785914261
|
||||
CEREDIGION,52.297905934896974,-3.9524382809074967
|
||||
CHESHIRE EAST,53.209779668583735,-2.2923524120906538
|
||||
CHESHIRE WEST AND CHESTER,53.12468649229667,-2.703640874356098
|
||||
CITY OF LONDON,51.515869084539396,-0.09345024349003202
|
||||
CONWY,53.125451225027945,-3.7469275629154897
|
||||
CORNWALL,50.2491094902892,-4.642072961722217
|
||||
COUNTY DURHAM,54.46928915708376,-1.840983172985692
|
||||
COVENTRY,52.20619163815314,-1.5190329484575433
|
||||
CROYDON,51.33122440611814,-0.07773715861848832
|
||||
CUMBRIA,54.470582575648244,-2.902600383252353
|
||||
DARLINGTON,54.51355967194039,-1.5680201999230523
|
||||
DENBIGHSHIRE,53.07313542431554,-3.347662396412462
|
||||
DERBY,52.98317870391253,-1.471762916352353
|
||||
DERBYSHIRE,52.96237103431297,-1.6019383162802616
|
||||
DEVON,50.75993290464059,-3.6572707805745353
|
||||
DONCASTER,53.579077870304175,-1.1091519021581622
|
||||
DORSET,50.80117614559981,-2.4141088997141975
|
||||
DUDLEY,52.466075739334926,-2.101688961593882
|
||||
EALING,51.69946371446451,-0.31413253292570953
|
||||
EAST RIDING OF YORKSHIRE,53.9506321883079,-0.6619808168243948
|
||||
EAST SUSSEX,50.8319515317622,0.33441692286193403
|
||||
ENFIELD,51.79829813489722,-0.08133941451400101
|
||||
ESSEX,51.61177562858481,0.5408806396014519
|
||||
FLINTSHIRE,53.18448452051185,-3.176529270275655
|
||||
GATESHEAD,54.984104331680726,-1.6867966327256207
|
||||
GLOUCESTERSHIRE,51.95116469210396,-2.152140175011601
|
||||
GREENWICH,51.298529627584855,0.05009798110429057
|
||||
GWYNEDD,52.90798692199907,-3.815807248465912
|
||||
HACKNEY,51.715573990309835,-0.06047668080560671
|
||||
HALTON,53.37945371869939,-2.6885285111965866
|
||||
HAMMERSMITH AND FULHAM,51.45669431471315,-0.21734862391196488
|
||||
HAMPSHIRE,51.35882747857323,-1.2472236572124424
|
||||
HARINGEY,51.71488485869694,-0.10670896820865851
|
||||
HARROW,51.69502976226169,-0.3360141730528605
|
||||
HARTLEPOOL,54.67019690697325,-1.2702881849113061
|
||||
HAVERING,51.68803382335829,0.23538931286606415
|
||||
"HEREFORDSHIRE, COUNTY OF",52.05661428266539,-2.7394973894756567
|
||||
HERTFORDSHIRE,51.97545351306396,-0.2768104374496038
|
||||
HILLINGDON,51.67744993832507,-0.44168376669816023
|
||||
HOUNSLOW,51.31550103034914,-0.37851470463324743
|
||||
ISLE OF ANGLESEY,53.27637540915653,-4.323495411729392
|
||||
ISLE OF WIGHT,50.62684579406237,-1.3335589426514434
|
||||
ISLES OF SCILLY,49.923857744201605,-6.302263516809768
|
||||
ISLINGTON,51.66454658738323,-0.10992970115558956
|
||||
KENSINGTON AND CHELSEA,51.49977592399342,-0.18981078381787103
|
||||
KENT,51.066980402556894,0.72177006521006
|
||||
"KINGSTON UPON HULL, CITY OF",53.894135701816644,-0.30380941990063115
|
||||
KINGSTON UPON THAMES,51.42789080754545,-0.28368404321251495
|
||||
KIRKLEES,53.84779145117579,-1.7808194218728275
|
||||
KNOWSLEY,53.48284092504563,-2.8329791954991275
|
||||
LAMBETH,51.252923290285565,-0.11380231585035454
|
||||
LANCASHIRE,53.39410422518683,-2.460896340904076
|
||||
LEEDS,53.55494339794778,-1.5074406609781625
|
||||
LEICESTER,52.7035904712036,-1.1304165681356237
|
||||
LEICESTERSHIRE,52.372384242153444,-1.3774821236258858
|
||||
LEWISHAM,51.26146486742923,-0.017302263531446847
|
||||
LINCOLNSHIRE,53.019325697607805,-0.23840017404638325
|
||||
LIVERPOOL,53.51161042331058,-2.9133522899513755
|
||||
LUTON,51.96794156247519,-0.4231450525783596
|
||||
MANCHESTER,53.618174414336764,-2.2337215842169944
|
||||
MEDWAY,51.32754494250598,0.5632336335498731
|
||||
MERTHYR TYDFIL,51.749169200604825,-3.36403864047987
|
||||
MERTON,51.37364806533906,-0.18868296177359278
|
||||
MIDDLESBROUGH,54.5098082464691,-1.211038279554591
|
||||
MILTON KEYNES,52.01693552290149,-0.7406232665194876
|
||||
MONMOUTHSHIRE,51.78143655329183,-2.9039386644643197
|
||||
NEATH PORT TALBOT,51.59538437854254,-3.7458617902677283
|
||||
NEWCASTLE UPON TYNE,55.00208530426788,-1.652806624671881
|
||||
NEWHAM,51.75154898367921,0.027418339450078835
|
||||
NEWPORT,51.53253056059282,-2.8977514562758477
|
||||
NORFOLK,52.3032223796034,0.9647662889518414
|
||||
NORTH EAST LINCOLNSHIRE,53.50967645052903,-0.13922750148994814
|
||||
NORTH LINCOLNSHIRE,53.57540769163687,-0.5237063875323392
|
||||
NORTH SOMERSET,51.35265217208383,-2.754333708085771
|
||||
NORTH TYNESIDE,55.00390319683472,-1.5092377782362794
|
||||
NORTH YORKSHIRE,54.037083506236726,-1.5496083229591298
|
||||
NORTHAMPTONSHIRE,52.090056204873584,-0.8673643733062965
|
||||
NORTHUMBERLAND,55.268382697315424,-2.075107564148198
|
||||
NOTTINGHAM,52.95517248670217,-1.166635297324727
|
||||
NOTTINGHAMSHIRE,53.03298887412134,-1.006945929298795
|
||||
OLDHAM,53.659965283524954,-2.052688245629671
|
||||
OXFORDSHIRE,51.93769526591072,-1.2911207463303098
|
||||
PEMBROKESHIRE,51.87232817560273,-4.908191395785854
|
||||
PETERBOROUGH,52.62511626981561,-0.2689975241368676
|
||||
PLYMOUTH,50.29446598251615,-4.112955625237552
|
||||
PORTSMOUTH,50.91433206435089,-1.0702659081823802
|
||||
POWYS,52.35028728472521,-3.4364646802117074
|
||||
READING,51.48972751726377,-0.9907195716377762
|
||||
REDBRIDGE,51.74619394585629,0.0701000048233879
|
||||
REDCAR AND CLEVELAND,54.52674848959172,-1.0057471172413288
|
||||
RICHMOND UPON THAMES,51.40228740909276,-0.28924251316631455
|
||||
ROCHDALE,53.67734692115036,-2.14815188340053
|
||||
ROTHERHAM,53.27571588878268,-1.2866084213986422
|
||||
RUTLAND,52.66741819281054,-0.6255844565552813
|
||||
SALFORD,53.39900474827836,-2.3848977331687684
|
||||
SANDWELL,52.58696674791831,-2.007627650605722
|
||||
SEFTON,53.41754419091054,-2.9918998460398845
|
||||
SHEFFIELD,53.594572416421464,-1.5427564265432459
|
||||
SHROPSHIRE,52.68421414164122,-2.7366875706426375
|
||||
SLOUGH,51.500375556628576,-0.5761037634462686
|
||||
SOLIHULL,52.36591301434561,-1.7157174664625492
|
||||
SOMERSET,51.15203995716832,-3.2953379430424437
|
||||
SOUTH GLOUCESTERSHIRE,51.619868102630875,-2.469430184260059
|
||||
SOUTH TYNESIDE,54.994706019365786,-1.4469508035803413
|
||||
SOUTHAMPTON,50.984805930473584,-1.4002768042215858
|
||||
SOUTHEND-ON-SEA,51.562157807336284,0.7069905953535786
|
||||
SOUTHWARK,51.26247572937943,-0.07306483663823536
|
||||
ST. HELENS,53.442240723358644,-2.7032424159534347
|
||||
STAFFORDSHIRE,52.54946704767607,-2.027491119365553
|
||||
STOCKPORT,53.243567817667724,-2.1248973952531918
|
||||
STOCKTON-ON-TEES,54.60356568786033,-1.3063893005278557
|
||||
STOKE-ON-TRENT,53.0018684063432,-2.1588155163720084
|
||||
SUFFOLK,52.07327606663186,1.049040133490474
|
||||
SUNDERLAND,54.95658521287448,-1.433572135990224
|
||||
SURREY,51.75817482314145,-0.3386369800762059
|
||||
SUTTON,51.33189096687447,-0.17228958486126392
|
||||
SWANSEA,51.734320352502984,-3.967180818043868
|
||||
SWINDON,51.64295753076632,-1.7336382187066433
|
||||
TAMESIDE,53.4185402114593,-2.0769462404028474
|
||||
TELFORD AND WREKIN,52.709149095326744,-2.4894724871905916
|
||||
THURROCK,51.508227793073466,0.33492786371540356
|
||||
TORBAY,50.494049197230815,-3.5551646045072913
|
||||
TORFAEN,51.69896506141925,-3.0509328418360218
|
||||
TOWER HAMLETS,51.68485859523772,-0.03638140322291906
|
||||
TRAFFORD,53.314621144815334,-2.3656560688750687
|
||||
VALE OF GLAMORGAN,51.477096810804674,-3.3980039155600954
|
||||
WAKEFIELD,53.81677380462442,-1.4208545508030999
|
||||
WALSALL,52.742742908764974,-1.9703315889024553
|
||||
WALTHAM FOREST,51.723501987712325,-0.01886180175957716
|
||||
WANDSWORTH,51.24653418036352,-0.2001743797936436
|
||||
WARRINGTON,53.338554119123636,-2.561564052456012
|
||||
WARWICKSHIRE,52.04847200574421,-1.5686356193411675
|
||||
WEST BERKSHIRE,51.472960442069805,-1.2740171035533379
|
||||
WEST SUSSEX,51.11473921001523,-0.4593527537340543
|
||||
WESTMINSTER,51.613346179755915,-0.15298252171750404
|
||||
WIGAN,53.58763891955546,-2.5723844100365545
|
||||
WILTSHIRE,51.48575283497703,-1.926537553406791
|
||||
WINDSOR AND MAIDENHEAD,51.494612540256846,-0.6753936432282348
|
||||
WIRRAL,53.237217504292545,-3.0650813262796417
|
||||
WOKINGHAM,51.45966460093226,-0.8993706058495408
|
||||
WOLVERHAMPTON,52.71684834050869,-2.127594624973283
|
||||
WORCESTERSHIRE,52.05799103802506,-2.209184250840713
|
||||
WREXHAM,53.00080440180421,-2.991958507191866
|
||||
YORK,53.99232942499273,-1.073788787620359
|
||||
|
155148
5/data science/2e/BoardingData.csv
Normal file
790
5/data science/2e/BoardingData.ipynb
Normal file
48060
5/data science/2e/worldcities.csv
Executable file
3675
5/data science/3/1_01_data_loading.ipynb
Normal file
2530339
5/data science/3/1_02_EDA.ipynb
Normal file
1907
5/data science/3/1_03_categorical_feature_engineering.ipynb
Normal file
1864
5/data science/3/1_04_nvtabular_and_mgpu.ipynb
Normal file
552
5/data science/3/2_01_classification_primer.ipynb
Normal file
2383
5/data science/3/2_02_accelerated_model_development.ipynb
Normal file
5930
5/data science/3/2_03_model_tuning.ipynb
Normal file
776
5/data science/3/2_04_embedding.ipynb
Normal file
@ -0,0 +1,776 @@
|
||||
{
|
||||
"cells": [
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "0bf7f930-76a1-4c16-84e4-cf1e73b54c55",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "400a41da-bc38-4e9a-9ece-d2744ffb16b0",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"# Enhancing Data Science Outcomes With Efficient Workflow #"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "8897c66c-4f9d-48b4-a60b-ddae16f2f61b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"## 04 - Embeddings ##\n",
|
||||
"In this lab, you will use high-performance computing to create machine learning solutions. This lab covers the model development portion of the data science workflow. A good machine learning solution excels at both accuracy and inference performance. \n",
|
||||
"\n",
|
||||
"<p><img src='images/pipeline_overview_2.png' width=1080></p>\n",
|
||||
"\n",
|
||||
"**Table of Contents**\n",
|
||||
"<br>\n",
|
||||
"This notebook covers the below sections: \n",
|
||||
"1. [Entity Embedding](#s4-1)\n",
|
||||
"2. [Training the Embeddings](#s4-2)\n",
|
||||
" * [Preparing the Data - Normalization](#s4-2.1)\n",
|
||||
" * [Model Building](#s4-2.2)\n",
|
||||
"    * [Begin Training](#s4-2.3)\n",
|
||||
"3. [Visualizing the Embeddings](#s4-3)\n",
|
||||
"4. [Conclusion](#s4-4)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "28538773-6b95-4840-aca2-73a6f7d98b07",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-1'></a>\n",
|
||||
"## Entity Embeddings ##\n",
|
||||
"[Entity Embeddings](https://arxiv.org/pdf/1604.06737.pdf) are very similar to word embeddings used in NLP. They are a way to represent categorical features in a defined latent space. In the latent space, categories that are semantically similar have similar vectors. Embeddings can be trained to assign a learnable feature vector to each category. Using embeddings, each categorical value is mapped to its own associated vector representation that is more informative than a single point value. Even though embeddings require a large amount of data and computational resources to train, they have proven to be a great alternative encoding method to consider. Once trained, embeddings can boost the performance of downstream machine learning tasks when used as the input features. Users can combine the power of deep learning with traditional machine learning on tabular data. \n",
|
||||
"\n",
|
||||
"<p><img src='images/embedding.png' width=720></p>\n",
|
||||
"\n",
|
||||
"Reasons for using embeddings include: \n",
|
||||
"* It is much more efficient than the one-hot approach for encoding when cardinality is high\n",
|
||||
"* Allows rich relationships and complexities between categories to be captured\n",
|
||||
"* Reduce memory usage and speed up downstream machine learning model training\n",
|
||||
"* Once trained, the same embedding can be used for various use cases\n",
|
||||
"* Can be used to visualize categorical data and for data clustering, since the embedding space quantifies semantic similarity as distance between the categories in the latent space\n",
|
||||
"* Mitigates the need to perform cumbersome manual feature engineering, which requires extensive domain knowledge\n",
|
||||
"\n",
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"Below are some tips about embeddings: \n",
|
||||
"* Requires training with large amounts of data, making it inappropriate for unseen data such as when new categories are added\n",
|
||||
"* Can overfit\n",
|
||||
"* Difficult to interpret"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "6ba4160d-4b41-40d3-93bc-f1fae0b9dddc",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-2'></a>\n",
|
||||
"## Training the Embeddings ##\n",
|
||||
"Embeddings aim to represent each entity as a numeric vector such that products in similar context have similar vectors. Mathematically, similar entities will have a large dot product whereas every entity when one-hot encoded has a zero dot product with every other entity. This is because all one-hot vectors are orthogonal. \n",
|
||||
"\n",
|
||||
"We will use [PyTorch](https://pytorch.org/) to train a simple fully-connected neural network. A surrogate problem is set up for the purpose of finding the embedding vectors. Neural networks have difficulty with sparse categorical features. Traditionally, embeddings are a way to reduce those features to increase model performance. \n",
|
||||
"\n",
|
||||
"Technically, the idea of an embedding layer is very similar to a dense or linear layer (without bias) in the neural network. When training an embedding this way, users will one-hot encode the categorical data so each record becomes a vector with C features, where C is the cardinality. We then perform matrix vector multiplication on the input vector and the weights before feeding the next layer. This is inefficient when the number of input features is large and sparse, as is the case for categorical features from a tabular dataset. \n",
|
||||
"\n",
|
||||
"A better and more efficient approach would be to train a `torch.nn.Embedding` layer, which can be treated as a \"lookup\" table with the label-encoded category id as the index. By choosing this, we avoid one-hot encoding and the matrix vector multiplication. \n",
|
||||
"\n",
|
||||
"<p><img src='images/surrogate_problem.png' width=720></p>\n",
|
||||
"\n",
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"Embeddings will naturally be affected by how the surrogate problem is defined. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 1,
|
||||
"id": "ec50a570-247f-4cfc-8dc5-2c2b501de703",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# import dependencies\n",
|
||||
"from tqdm import tqdm\n",
|
||||
"import cudf\n",
|
||||
"import cuml\n",
|
||||
"import dask_cudf\n",
|
||||
"\n",
|
||||
"import torch\n",
|
||||
"import torch.nn as nn\n",
|
||||
"import torch.nn.functional as F\n",
|
||||
"import torch.optim as torch_optim\n",
|
||||
"from torch.utils.data import Dataset, DataLoader"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 2,
|
||||
"id": "036bf6ee-d5cb-4f20-a591-681706a098ac",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set device cuda to use GPU\n",
|
||||
"device=torch.device('cuda')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 3,
|
||||
"id": "3726fc69-2a2b-42e2-be12-d235ce2322c1",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define features and label\n",
|
||||
"cols=['brand', 'cat_0', 'cat_1', 'cat_2', 'price', 'target']\n",
|
||||
"cat_cols=['brand', 'cat_0', 'cat_1', 'cat_2']\n",
|
||||
"label='target'\n",
|
||||
"\n",
|
||||
"feature_cols=[col for col in cols if col != label]\n",
|
||||
"cont_cols=[col for col in feature_cols if col not in cat_cols] # ['price']"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 4,
|
||||
"id": "ae87d23f-0c67-4758-8842-ca5770e740f9",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"Total of 2461697 records.\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# read data\n",
|
||||
"parquet_dir='processed_parquet'\n",
|
||||
"\n",
|
||||
"ddf=dask_cudf.read_parquet(parquet_dir, columns=cols)\n",
|
||||
"gdf=ddf.compute()\n",
|
||||
"\n",
|
||||
"print(f'Total of {len(gdf)} records.')"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "b9110c9d-5924-4cb2-8bf3-cabd398aad0e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"Even though we intend to keep all the data in one GPU, we still recommend loading data with `Dask-cuDF`. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 5,
|
||||
"id": "f782bc7e-e6c4-4d87-a839-5a99227dca7c",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/html": [
|
||||
"<div>\n",
|
||||
"<style scoped>\n",
|
||||
" .dataframe tbody tr th:only-of-type {\n",
|
||||
" vertical-align: middle;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe tbody tr th {\n",
|
||||
" vertical-align: top;\n",
|
||||
" }\n",
|
||||
"\n",
|
||||
" .dataframe thead th {\n",
|
||||
" text-align: right;\n",
|
||||
" }\n",
|
||||
"</style>\n",
|
||||
"<table border=\"1\" class=\"dataframe\">\n",
|
||||
" <thead>\n",
|
||||
" <tr style=\"text-align: right;\">\n",
|
||||
" <th></th>\n",
|
||||
" <th>brand</th>\n",
|
||||
" <th>cat_0</th>\n",
|
||||
" <th>cat_1</th>\n",
|
||||
" <th>cat_2</th>\n",
|
||||
" <th>price</th>\n",
|
||||
" <th>target</th>\n",
|
||||
" </tr>\n",
|
||||
" </thead>\n",
|
||||
" <tbody>\n",
|
||||
" <tr>\n",
|
||||
" <th>0</th>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>100.229996</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>1</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>871.839966</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>2</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" <td>872.090027</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>3</th>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>6</td>\n",
|
||||
" <td>5</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>306.690002</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" <tr>\n",
|
||||
" <th>4</th>\n",
|
||||
" <td>13</td>\n",
|
||||
" <td>2</td>\n",
|
||||
" <td>3</td>\n",
|
||||
" <td>24</td>\n",
|
||||
" <td>334.349976</td>\n",
|
||||
" <td>1</td>\n",
|
||||
" </tr>\n",
|
||||
" </tbody>\n",
|
||||
"</table>\n",
|
||||
"</div>"
|
||||
],
|
||||
"text/plain": [
|
||||
" brand cat_0 cat_1 cat_2 price target\n",
|
||||
"0 1 6 5 2 100.229996 1\n",
|
||||
"1 2 1 1 1 871.839966 1\n",
|
||||
"2 2 1 1 1 872.090027 1\n",
|
||||
"3 2 6 5 2 306.690002 1\n",
|
||||
"4 13 2 3 24 334.349976 1"
|
||||
]
|
||||
},
|
||||
"execution_count": 5,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"gdf.head()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 6,
|
||||
"id": "3673f202-7aea-43a7-a569-4c210a614529",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"{'brand': (3303, 7), 'cat_0': (14, 3), 'cat_1': (61, 3), 'cat_2': (90, 3)}"
|
||||
]
|
||||
},
|
||||
"execution_count": 6,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# the embedding vectors will start with 0 so we decrease the categorical values by 1 to match\n",
|
||||
"gdf[cat_cols]=gdf[cat_cols]-1\n",
|
||||
"\n",
|
||||
"n_uniques=gdf.nunique()\n",
|
||||
"\n",
|
||||
"# use higher of 4th root of nunique and 3 for vector dimension\n",
|
||||
"embedding_sizes={col: (n_uniques[col], max(3, int(n_uniques[col]**0.25))) for col in cat_cols}\n",
|
||||
"embedding_sizes"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a327c1f9-0683-45f1-90a6-6d4d4daa093c",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"The size of embeddings can become very large. For example, large embeddings are usually needed for users and items for large platforms. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "2c1c7fee-dad0-4009-a55c-513465db8a7c",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-2.1'></a>\n",
|
||||
"### Preparing the Data - Normalization ###\n",
|
||||
"**Normalization** is required to enable neural networks to leverage numerical features. Tree-based models do not require normalization as they define the split independent of the scale of a feature. Without normalization, neural networks are difficult to train. The reason is that different numerical features have different scales. When we combine the features in a hidden layer, the different scales make it more difficult to extract patterns from it. \n",
|
||||
"\n",
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"We will also implement a `torch.nn.BatchNorm1d`[[doc]](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html) layer to mitigate the exploding gradient problem. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 7,
|
||||
"id": "fb1840b3-a7d8-4b91-98ef-bddf59afd5e6",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# normalize data\n",
|
||||
"gdf['price']=cuml.preprocessing.StandardScaler().fit_transform(gdf[['price']])"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "d6991948-f79a-4b51-b3a9-2571b2be5262",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"source": [
|
||||
"<a name='s4-2.2'></a>\n",
|
||||
"### Model Building ###\n",
|
||||
"We construct a model with several layers. The embeddings will be the same dimension as num_unique x vector_size. The embeddings will be concatenated, along with the continous variable(s), before they are fed into the next layer. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 8,
|
||||
"id": "35a8055b-8b7b-4fb8-8d3a-9f36fc03b171",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define neural network with embedding layers\n",
|
||||
"class ProductPurchaseModel(nn.Module):\n",
|
||||
" def __init__(self, embedding_sizes, n_cont):\n",
|
||||
" super().__init__()\n",
|
||||
" # make an embedding for each categorical feature\n",
|
||||
" # The `nn.Embedding` layer can be thought of as a lookup table where the key is \n",
|
||||
" # the category index and the value is the corresponding embedding vector\n",
|
||||
" self.embeddings=nn.ModuleList([nn.Embedding(n_categories, size) for n_categories, size in embedding_sizes.values()])\n",
|
||||
" \n",
|
||||
" # n_emb is the length of all embeddings combined\n",
|
||||
" n_emb=sum(e.embedding_dim for e in self.embeddings)\n",
|
||||
" \n",
|
||||
" self.n_emb=n_emb\n",
|
||||
" self.n_cont=n_cont\n",
|
||||
" self.emb_drop = nn.Dropout(0.6)\n",
|
||||
" \n",
|
||||
" # apply dropout, batch norm and linear layers\n",
|
||||
" self.bn1=nn.BatchNorm1d(self.n_cont)\n",
|
||||
" self.lin1=nn.Linear(self.n_emb + self.n_cont, 200)\n",
|
||||
" self.drop1=nn.Dropout(0.3)\n",
|
||||
" self.bn2=nn.BatchNorm1d(200)\n",
|
||||
" self.drop2=nn.Dropout(0.3)\n",
|
||||
" self.lin2=nn.Linear(200, 70)\n",
|
||||
" self.bn3=nn.BatchNorm1d(70)\n",
|
||||
" self.lin3=nn.Linear(70, 2)\n",
|
||||
"\n",
|
||||
" def forward(self, X_cat, X_cont):\n",
|
||||
" # map each categorical feature to the embedding vector on its corresponding embedding layer\n",
|
||||
" x_1=[embedding(X_cat[:, idx]) for idx, embedding in enumerate(self.embeddings)]\n",
|
||||
" \n",
|
||||
" # concatenate all categorical embedding vectors together\n",
|
||||
" x_1=torch.cat(x_1, 1)\n",
|
||||
" \n",
|
||||
" # apply random drop out, normalization, and activation\n",
|
||||
" x_1=self.emb_drop(x_1)\n",
|
||||
" x_2=self.bn1(X_cont)\n",
|
||||
" \n",
|
||||
" # concatenate categorical embeddings to input layer from continuous variable(s)\n",
|
||||
" x_1=torch.cat([x_1, x_2], 1)\n",
|
||||
" \n",
|
||||
" # apply random drop out, normalization, and activation\n",
|
||||
" x_1=F.relu(self.lin1(x_1))\n",
|
||||
" x_1=self.drop1(x_1)\n",
|
||||
" x_1=self.bn2(x_1)\n",
|
||||
" x_1=F.relu(self.lin2(x_1))\n",
|
||||
" x_1=self.drop2(x_1)\n",
|
||||
" x_1=self.bn3(x_1)\n",
|
||||
" x_1=self.lin3(x_1)\n",
|
||||
" return x_1"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "c52e50a2-99b6-4a8c-aa65-5f11a7806c6e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<p><img src='images/tip.png' width=720></p>\n",
|
||||
"\n",
|
||||
"Tabular data uses shallow models with huge embedding tables and few feed-forward layers. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 9,
|
||||
"id": "5b7d18b1-d29e-43d4-8091-3aba41968ebf",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"data": {
|
||||
"text/plain": [
|
||||
"ProductPurchaseModel(\n",
|
||||
" (embeddings): ModuleList(\n",
|
||||
" (0): Embedding(3303, 7)\n",
|
||||
" (1): Embedding(14, 3)\n",
|
||||
" (2): Embedding(61, 3)\n",
|
||||
" (3): Embedding(90, 3)\n",
|
||||
" )\n",
|
||||
" (emb_drop): Dropout(p=0.6, inplace=False)\n",
|
||||
" (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
||||
" (lin1): Linear(in_features=17, out_features=200, bias=True)\n",
|
||||
" (drop1): Dropout(p=0.3, inplace=False)\n",
|
||||
" (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
||||
" (drop2): Dropout(p=0.3, inplace=False)\n",
|
||||
" (lin2): Linear(in_features=200, out_features=70, bias=True)\n",
|
||||
" (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
|
||||
" (lin3): Linear(in_features=70, out_features=2, bias=True)\n",
|
||||
")"
|
||||
]
|
||||
},
|
||||
"execution_count": 9,
|
||||
"metadata": {},
|
||||
"output_type": "execute_result"
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"# instantiate model\n",
|
||||
"model=ProductPurchaseModel(embedding_sizes, len(cont_cols))\n",
|
||||
"model.to(device)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "f35dab8e-f1cd-484b-999e-b9e0f7e79edd",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Next, we define a `torch.utils.data.Dataset` class to be use by `torch.utils.data.DataLoader`. The Dataset is makes it easier to track separate categorical and continuous variables. The DatalLoader wraps an iterable around the Dataset to enable easy access to the samples. More information about Dataset and DataLoader can be found in quick PyTorch [guide](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html). "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 10,
|
||||
"id": "98f74906-7b79-4fda-8626-df17023ee512",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define dataset\n",
|
||||
"class myDataset(Dataset):\n",
|
||||
" def __init__(self, X, y, cat_cols, cont_cols):\n",
|
||||
" self.X_cat=torch.as_tensor(X.loc[:, cat_cols].copy().values.astype('int32'), device=device)\n",
|
||||
" self.X_cont=torch.as_tensor(X.loc[:, cont_cols].copy().values.astype('float32'), device=device)\n",
|
||||
" self.y=torch.as_tensor(y.astype('int64'), device=device)\n",
|
||||
" \n",
|
||||
" def __len__(self):\n",
|
||||
" return len(self.y)\n",
|
||||
" \n",
|
||||
" def __getitem__(self, idx): \n",
|
||||
" return self.X_cat[idx], self.X_cont[idx], self.y[idx]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 11,
|
||||
"id": "a0973509-6a11-49d8-b346-ab9ec8cfaef5",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# instantiate dataset\n",
|
||||
"X_train=gdf[feature_cols]\n",
|
||||
"y_train=gdf['target'].values\n",
|
||||
"\n",
|
||||
"train_ds=myDataset(X_train, y_train, cat_cols, cont_cols)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "5336cfd0-39ed-4285-9b66-e4f5d1b7d75e",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-2.3'></a>\n",
|
||||
"### Begin Training ###\n",
|
||||
"We will set some parameters for training. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 12,
|
||||
"id": "0604708e-1c2c-485b-a029-eadd17356a03",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# set optimizer\n",
|
||||
"def get_optimizer(model, lr = 0.001, wd = 0.0):\n",
|
||||
" parameters=filter(lambda p: p.requires_grad, model.parameters())\n",
|
||||
" optim=torch_optim.Adam(parameters, lr=lr, weight_decay=wd)\n",
|
||||
" return optim"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 13,
|
||||
"id": "39e0ce25-f65c-4330-98cc-34ee4b30bae4",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define training function\n",
|
||||
"def train_model(model, optim, train_dl):\n",
|
||||
" # set the model to training, which is useful for BatchNorm and Dropout layers that behave differently during training and evaluation\n",
|
||||
" model.train()\n",
|
||||
" total=0\n",
|
||||
" sum_loss=0\n",
|
||||
" \n",
|
||||
" # iterate through batches\n",
|
||||
" for b, (X_cat, X_cont, y) in enumerate(train_dl):\n",
|
||||
" batch=y.shape[0]\n",
|
||||
" \n",
|
||||
" # forward pass\n",
|
||||
" output=model(X_cat, X_cont)\n",
|
||||
" \n",
|
||||
" # calculate loss\n",
|
||||
" loss=F.cross_entropy(output, y)\n",
|
||||
" \n",
|
||||
" # zero out the gradients so the parameters update correctly, otherwise gradients would be combined with old\n",
|
||||
" optim.zero_grad()\n",
|
||||
" loss.backward()\n",
|
||||
" optim.step()\n",
|
||||
" \n",
|
||||
" # calculate total loss per batch\n",
|
||||
" total+=batch\n",
|
||||
" sum_loss+=batch*(loss.item())\n",
|
||||
" return sum_loss/total"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "a60dd511-3121-4eb0-beb7-3a03d56de202",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"Instantiate a `torch.utils.data.DataLoader` and begin training. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": 14,
|
||||
"id": "5a25e4e6-f0b5-4bbc-8a1d-0eee74c7faaf",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# define training loop\n",
|
||||
"def train_loop(model, epochs, lr=0.01, wd=0.0):\n",
|
||||
" # instantiate optimizer\n",
|
||||
" optim=get_optimizer(model, lr = lr, wd = wd)\n",
|
||||
" \n",
|
||||
" # iterate through number of epochs\n",
|
||||
" for i in tqdm(range(epochs)): \n",
|
||||
" loss=train_model(model, optim, train_dl)\n",
|
||||
" print(\"training loss: \", round(loss, 3))"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "68b43459-fb0a-4c13-9371-7c15327ff624",
|
||||
"metadata": {
|
||||
"scrolled": true,
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 33%|███▎ | 1/3 [00:28<00:57, 28.79s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"training loss: 0.666\n"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stderr",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
" 67%|██████▋ | 2/3 [00:57<00:28, 28.67s/it]"
|
||||
]
|
||||
},
|
||||
{
|
||||
"name": "stdout",
|
||||
"output_type": "stream",
|
||||
"text": [
|
||||
"training loss: 0.665\n"
|
||||
]
|
||||
}
|
||||
],
|
||||
"source": [
|
||||
"%%time\n",
|
||||
"\n",
|
||||
"# define batch size and begin training\n",
|
||||
"batch_size=1000\n",
|
||||
"train_dl=DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n",
|
||||
"\n",
|
||||
"train_loop(model, epochs=3, lr=0.05, wd=0.00001)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "7d6656b3-3642-4279-b787-0c034c45b739",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-3'></a>\n",
|
||||
"## Visualizing the Embeddings ##"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "20973ee4-a723-4931-bf50-8efffe275026",
|
||||
"metadata": {
|
||||
"tags": []
|
||||
},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# visualize embeddings\n",
|
||||
"\n",
|
||||
"# import dependencies\n",
|
||||
"import plotly.express as px\n",
|
||||
"import pandas as pd\n",
|
||||
"\n",
|
||||
"# pick category to visualize\n",
|
||||
"category='brand'\n",
|
||||
"\n",
|
||||
"category_label=pd.read_parquet(f'categories/unique.{category}.parquet')[category]\n",
|
||||
"category_label=category_label[1:]\n",
|
||||
"\n",
|
||||
"embeddings_idx=list(embedding_sizes.keys()).index(category)\n",
|
||||
"embeddings=model.embeddings[embeddings_idx].weight.detach().cpu().numpy()\n",
|
||||
"\n",
|
||||
"fig=px.scatter_3d(\n",
|
||||
" x=embeddings[:, 0], \n",
|
||||
" y=embeddings[:, 1], \n",
|
||||
" z=embeddings[:, 2], \n",
|
||||
" text=category_label, \n",
|
||||
" height=720\n",
|
||||
")\n",
|
||||
"fig.show()"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "code",
|
||||
"execution_count": null,
|
||||
"id": "130a2b16-89e5-4eda-8155-014a75a3638e",
|
||||
"metadata": {},
|
||||
"outputs": [],
|
||||
"source": [
|
||||
"# persist embeddings\n",
|
||||
"!mkdir trained_embedding_weights\n",
|
||||
"\n",
|
||||
"import pandas as pd\n",
|
||||
"import numpy as np\n",
|
||||
"\n",
|
||||
"for idx, each_col in enumerate(cat_cols): \n",
|
||||
" weights=model.embeddings[idx].weight.detach().cpu().numpy()\n",
|
||||
" pd.DataFrame(weights).to_csv(f'trained_embedding_weights/{each_col}.csv', index=False)"
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "bc7cce0e-6dcb-4d5a-82dd-e8074abaaaec",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a name='s4-4'></a>\n",
|
||||
"## Conclusion ##\n",
|
||||
"Deep Learning is very good at feature extraction, which can be used for finding categorical embeddings. This is the advantage of using a Deep Learning approach, as it requires way less feature engineering and less dependent on domain knowledge. "
|
||||
]
|
||||
},
|
||||
{
|
||||
"cell_type": "markdown",
|
||||
"id": "997bd6f7-9efb-4fee-b3d4-9d4454694c7b",
|
||||
"metadata": {},
|
||||
"source": [
|
||||
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
|
||||
]
|
||||
}
|
||||
],
|
||||
"metadata": {
|
||||
"kernelspec": {
|
||||
"display_name": "Python 3 (ipykernel)",
|
||||
"language": "python",
|
||||
"name": "python3"
|
||||
},
|
||||
"language_info": {
|
||||
"codemirror_mode": {
|
||||
"name": "ipython",
|
||||
"version": 3
|
||||
},
|
||||
"file_extension": ".py",
|
||||
"mimetype": "text/x-python",
|
||||
"name": "python",
|
||||
"nbconvert_exporter": "python",
|
||||
"pygments_lexer": "ipython3",
|
||||
"version": "3.9.16"
|
||||
}
|
||||
},
|
||||
"nbformat": 4,
|
||||
"nbformat_minor": 5
|
||||
}
|
||||
736
5/data science/3/3_01_model_deployment_for_inference.ipynb
Normal file
1552
5/data science/3/assessment.ipynb
Normal file
BIN
5/data science/3/images/DLI_Header.png
Normal file
|
After Width: | Height: | Size: 26 KiB |
BIN
5/data science/3/images/XGBoost.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
5/data science/3/images/active_kernel_files.png
Normal file
|
After Width: | Height: | Size: 200 KiB |
BIN
5/data science/3/images/agg.png
Normal file
|
After Width: | Height: | Size: 282 KiB |
BIN
5/data science/3/images/better_view.png
Normal file
|
After Width: | Height: | Size: 192 KiB |
BIN
5/data science/3/images/check.png
Normal file
|
After Width: | Height: | Size: 24 KiB |
BIN
5/data science/3/images/client_scheduler.png
Normal file
|
After Width: | Height: | Size: 340 KiB |
BIN
5/data science/3/images/credit.png
Normal file
|
After Width: | Height: | Size: 792 KiB |
BIN
5/data science/3/images/dashboard_status.png
Normal file
|
After Width: | Height: | Size: 270 KiB |
BIN
5/data science/3/images/dask.png
Normal file
|
After Width: | Height: | Size: 230 KiB |
BIN
5/data science/3/images/dask_dag.png
Normal file
|
After Width: | Height: | Size: 942 KiB |
BIN
5/data science/3/images/dask_dataframe.png
Normal file
|
After Width: | Height: | Size: 152 KiB |
BIN
5/data science/3/images/dask_diagnostics_1.png
Normal file
|
After Width: | Height: | Size: 45 KiB |
BIN
5/data science/3/images/dask_diagnostics_2.png
Normal file
|
After Width: | Height: | Size: 70 KiB |
BIN
5/data science/3/images/dask_diagnostics_sample_1.png
Normal file
|
After Width: | Height: | Size: 74 KiB |
BIN
5/data science/3/images/dask_diagnostics_sample_2_combined.png
Normal file
|
After Width: | Height: | Size: 706 KiB |
BIN
5/data science/3/images/dask_stats.png
Normal file
|
After Width: | Height: | Size: 176 KiB |
BIN
5/data science/3/images/dask_widget.png
Normal file
|
After Width: | Height: | Size: 859 KiB |
BIN
5/data science/3/images/data_loading_and_eda.png
Normal file
|
After Width: | Height: | Size: 129 KiB |
BIN
5/data science/3/images/data_preview.png
Normal file
|
After Width: | Height: | Size: 25 KiB |
BIN
5/data science/3/images/diagnostic_1.png
Normal file
|
After Width: | Height: | Size: 320 KiB |
BIN
5/data science/3/images/diagnostic_2.png
Normal file
|
After Width: | Height: | Size: 374 KiB |
BIN
5/data science/3/images/ds_on_gpu.png
Normal file
|
After Width: | Height: | Size: 66 KiB |
BIN
5/data science/3/images/dtypes.png
Normal file
|
After Width: | Height: | Size: 222 KiB |
BIN
5/data science/3/images/embedding.png
Normal file
|
After Width: | Height: | Size: 140 KiB |
BIN
5/data science/3/images/feature_engineering_methods.png
Normal file
|
After Width: | Height: | Size: 107 KiB |
BIN
5/data science/3/images/groupby.png
Normal file
|
After Width: | Height: | Size: 241 KiB |
BIN
5/data science/3/images/groupby_aggregation.png
Normal file
|
After Width: | Height: | Size: 75 KiB |
BIN
5/data science/3/images/groupby_aggregation_comp.png
Normal file
|
After Width: | Height: | Size: 743 KiB |
BIN
5/data science/3/images/groupby_apply.png
Normal file
|
After Width: | Height: | Size: 159 KiB |
BIN
5/data science/3/images/important.png
Normal file
|
After Width: | Height: | Size: 21 KiB |
BIN
5/data science/3/images/jl_launcher.png
Normal file
|
After Width: | Height: | Size: 43 KiB |
BIN
5/data science/3/images/kernel.png
Normal file
|
After Width: | Height: | Size: 9.2 KiB |
BIN
5/data science/3/images/kernel_menu.png
Normal file
|
After Width: | Height: | Size: 170 KiB |
BIN
5/data science/3/images/kernel_name.png
Normal file
|
After Width: | Height: | Size: 148 KiB |
BIN
5/data science/3/images/mem_usage.png
Normal file
|
After Width: | Height: | Size: 11 KiB |
BIN
5/data science/3/images/multi-gpu.png
Normal file
|
After Width: | Height: | Size: 952 KiB |
BIN
5/data science/3/images/nvtabular_diagram.png
Normal file
|
After Width: | Height: | Size: 186 KiB |
BIN
5/data science/3/images/nvtabular_operators.png
Normal file
|
After Width: | Height: | Size: 442 KiB |
BIN
5/data science/3/images/parquet.png
Normal file
|
After Width: | Height: | Size: 56 KiB |
BIN
5/data science/3/images/pipeline_overview.png
Normal file
|
After Width: | Height: | Size: 70 KiB |
BIN
5/data science/3/images/pipeline_overview_1.png
Normal file
|
After Width: | Height: | Size: 75 KiB |
BIN
5/data science/3/images/pipeline_overview_2.png
Normal file
|
After Width: | Height: | Size: 72 KiB |
BIN
5/data science/3/images/product_embedding.png
Normal file
|
After Width: | Height: | Size: 571 KiB |
BIN
5/data science/3/images/random_forest.png
Normal file
|
After Width: | Height: | Size: 30 KiB |
BIN
5/data science/3/images/rf_train.png
Normal file
|
After Width: | Height: | Size: 514 KiB |
BIN
5/data science/3/images/running_kernels.png
Normal file
|
After Width: | Height: | Size: 166 KiB |
BIN
5/data science/3/images/surrogate_problem.png
Normal file
|
After Width: | Height: | Size: 112 KiB |
BIN
5/data science/3/images/tabular_data_analytics.png
Normal file
|
After Width: | Height: | Size: 256 KiB |
BIN
5/data science/3/images/tabular_data_features.png
Normal file
|
After Width: | Height: | Size: 262 KiB |
BIN
5/data science/3/images/tabular_data_workflow.png
Normal file
|
After Width: | Height: | Size: 265 KiB |
BIN
5/data science/3/images/tip.png
Normal file
|
After Width: | Height: | Size: 18 KiB |
BIN
5/data science/3/images/triton_server_architecture.png
Normal file
|
After Width: | Height: | Size: 207 KiB |
BIN
5/data science/3/images/watch_gpu.png
Normal file
|
After Width: | Height: | Size: 131 KiB |