2026-02-17 23:13:20 +03:00
parent 65218abfb1
commit e52dde575a
429 changed files with 875 additions and 14 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because one or more lines are too long

@ -0,0 +1,29 @@
# CUDA compiler; target the Turing architecture (sm_75)
CC = nvcc -arch=sm_75
DEBUG ?= false
DIRS = dist build
ifeq ($(DEBUG), false)
CC += -O3
else
# -g -G: host and device debug info (disables most device optimizations)
CC += -g -G
endif
.PHONY: all run clean
all: $(DIRS) dist/app
# link host and device objects; -lcuda for the CUDA driver API
dist/app: build/main.o build/op.o
	$(CC) $^ -o $@ -lcuda
# assemble the hand-written PTX into a relocatable device object
build/op.o: op.ptx
	$(CC) $^ -dc -o $@
# emit PTX for main.cu (for inspection), then compile it with relocatable device code
build/main.o: main.cu
	$(CC) $^ -ptx -o build/main.ptx
	$(CC) $^ -rdc=true -dc -o $@
$(DIRS):
	mkdir -p $@
clean:
	rm -rf $(DIRS)

@ -0,0 +1,23 @@
[euclidean](https://web.archive.org/web/20230212044931/http://www-math.ucdenver.edu/~wcherowi/courses/m5410/exeucalg.html)
[ecdsa1](https://sefiks.com/2018/02/16/elegant-signatures-with-elliptic-curve-cryptography/)
[ecdsa2](https://learnmeabitcoin.com/technical/cryptography/elliptic-curve/ecdsa/)
[ptx](https://philipfabianek.com/posts/cuda-ptx-introduction)
High priority
6, 7 State Spaces / Properties of State Spaces: the key difference from a CPU! On a CPU, memory is mostly flat (RAM, caches). A GPU has many kinds of memory: global (.global), shared within a thread block (.shared), constant (.const), local (.local), and so on. This is the foundation for writing performant code.
19 Cost Estimates for Accessing State-Spaces: a direct continuation of the previous item. Explains which memory is fast and which is slow. Critical for optimization.
4 Operator Precedence: PTX syntax looks like assembly but allows expressions. Knowing operator precedence is essential.
8 Fundamental Type Specifiers: the data types in PTX (.b8, .s16, .f32, .b64, etc.). The analog of byte, word, dword in x86, adapted to GPU specifics.
3 Predefined Identifiers: predefined constants such as %tid, %ctaid, %ntid. This is the core of the CUDA execution model! Instead of a single instruction pointer (RIP/EIP) you have thread, block and grid identifiers (see the sketch after this list).
20 Operation Types: a classification of PTX instructions. Helps you find your way around the manual quickly.
1 PTX Directives: assembler directives (.version, .target, .global). The analog of sections and directives in NASM (SECTION .text, global _start).
Medium priority
21 Scopes: visibility scopes for atomic operations and barriers (.cta, .cluster, .gpu, .sys). Important for synchronization.
14, 40, 56 Various tables on Swizzling and Layout: cover advanced techniques for working with memory and matrices to optimize access. Relevant for low-level optimizations, similar to alignment and SIMD work in x86.
29 Summary of Floating-Point Instructions: an overview of the floating-point instructions. They matter enormously on a GPU.
30-32 Cache Operators / Eviction Priority Hints: cache control. An advanced topic for fine-tuning, analogous to prefetch instructions in x86.
53, 55, 56 Tables on MMA (Matrix Multiply-Accumulate): instructions for the tensor cores (the analog of FMA in x86, but for matrices). The heart of AI/HPC performance.
22-25 Comparison Operators: the peculiarities of comparisons for integer and floating-point numbers (NaN handling).
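
To make the Predefined Identifiers item concrete, here is a minimal CUDA C++ sketch (not part of the original notes; the kernel name and launch sizes are illustrative) of the built-ins that compile down to the PTX special registers %tid, %ctaid and %ntid:

```cuda
// threadIdx / blockIdx / blockDim map to the PTX special registers
// %tid, %ctaid and %ntid; compile with `nvcc -ptx` to inspect the output.
__global__ void global_index_demo(int *out, int n) {
    int gid = blockIdx.x * blockDim.x + threadIdx.x;  // %ctaid.x * %ntid.x + %tid.x
    if (gid < n) out[gid] = gid;
}

int main() {
    const int n = 64;
    int *d_out;
    cudaMalloc(&d_out, n * sizeof(int));
    global_index_demo<<<2, 32>>>(d_out, n);  // 2 blocks of 32 threads cover n = 64
    cudaDeviceSynchronize();
    cudaFree(d_out);
    return 0;
}
```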

5/data science/1e/main.cu (new file, 161 lines)

@ -0,0 +1,161 @@
#include <stdint.h>
// Tiled matrix multiply: C[N x K] = A[N x M] * B[M x K] using shared-memory tiles.
// Assumes N, M and K are multiples of TILE_SIZE; otherwise the early return below
// would let some threads skip the __syncthreads() barriers.
template <typename T, int TILE_SIZE>
__global__ void mat_mul(T *A, T *B, T *C, int N, int M, int K) {
  __shared__ T sA[TILE_SIZE][TILE_SIZE];
  __shared__ T sB[TILE_SIZE][TILE_SIZE];
  int bx = blockIdx.x, by = blockIdx.y;
  int tx = threadIdx.x, ty = threadIdx.y;
  int row = by * TILE_SIZE + ty;
  int col = bx * TILE_SIZE + tx;
  if (col >= K || row >= N) return;
  T sum = 0;
  int tiles_len = (M + TILE_SIZE - 1) / TILE_SIZE;
  for (int tile = 0; tile < tiles_len; tile++) {
    int aCol = tile * TILE_SIZE + tx;
    int bRow = tile * TILE_SIZE + ty;
    if (aCol < M) {
      sA[ty][tx] = A[row * M + aCol];
    } else {
      sA[ty][tx] = 0;
    }
    // Branchless zeroing: the mask is all ones when bRow < M and zero otherwise.
    // The load from B is still issued, so bRow must stay in bounds; valid for integral T only.
    sB[ty][tx] = (T)((uint64_t)B[bRow * K + col] & ((uint64_t)(bRow >= M) - 1));
    __syncthreads();
    for (int k = 0; k < TILE_SIZE; k++) {
      sum += sA[ty][k] * sB[k][tx];
    }
    // Wait until every thread is done with this tile before it is overwritten.
    __syncthreads();
  }
  C[row * K + col] = sum;
}
// Naive reference kernel: one thread per output element, global-memory loads only.
template <typename T>
__global__ void dumb_mat_mul(T *A, T *B, T *C, int N, int M, int K) {
  int col = blockIdx.x * blockDim.x + threadIdx.x;
  int row = blockIdx.y * blockDim.y + threadIdx.y;
  if (col >= K || row >= N) return;
  T sum = 0;
  for (int i = 0; i < M; i++) {
    sum += A[row * M + i] * B[i * K + col];
  }
  C[row * K + col] = sum;
}
#define N 1024
#define M 1024
#define K 1024
#define NO_PRINT 1
#define BLOCK_DIM 32
// The grid must cover the whole N x K output; a fixed grid of a single block
// would compute only one BLOCK_DIM x BLOCK_DIM tile of C.
#define GRID_DIM ((N + BLOCK_DIM - 1) / BLOCK_DIM)
#define MAT_TYPE int
#define MAT_FMT "%d\t"
#define A_LEN (N * M)
#define B_LEN (M * K)
#define C_LEN (N * K)
#define A_SIZE (sizeof(MAT_TYPE) * N * M)
#define B_SIZE (sizeof(MAT_TYPE) * M * K)
#define C_SIZE (sizeof(MAT_TYPE) * N * K)
#include <cstdio>
#include <random>
#include <chrono>
using namespace std::chrono;
template <typename T>
void mat_print(T *a, const char *fmt, int n, int m) {
for (auto row = 0; row < n; row++) {
for (auto col = 0; col < m; col++) {
printf(fmt, a[row * m + col]);
}
printf("\n");
}
}
int main() {
std::random_device rd;
std::mt19937 engine(rd());
std::uniform_int_distribution<MAT_TYPE> dist(1, 10);
auto buf = (MAT_TYPE *)malloc(A_SIZE + B_SIZE + C_SIZE);
for (auto i = 0; i < A_LEN + B_LEN; i++) {
buf[i] = dist(engine);
}
MAT_TYPE *a = buf;
MAT_TYPE *b = a + A_LEN;
MAT_TYPE *c = b + B_LEN;
#if NO_PRINT==0
printf("\na\n");
mat_print(a, MAT_FMT, N, M);
printf("\nb\n");
mat_print(b, MAT_FMT, M, K);
#endif
MAT_TYPE *d_a, *d_b, *d_c;
cudaMalloc(&d_a, A_SIZE);
cudaMalloc(&d_b, B_SIZE);
cudaMalloc(&d_c, C_SIZE);
cudaMemcpy(d_a, a, A_SIZE, cudaMemcpyHostToDevice);
cudaMemcpy(d_b, b, B_SIZE, cudaMemcpyHostToDevice);
dim3 gridDim(GRID_DIM, GRID_DIM);
dim3 blockDim(BLOCK_DIM, BLOCK_DIM);
int cycles = 0;
microseconds duration(0);
while (duration.count() < 1e6) {
auto start = high_resolution_clock::now();
mat_mul<MAT_TYPE, BLOCK_DIM><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
cudaDeviceSynchronize();
auto end = high_resolution_clock::now();
cycles++;
duration += duration_cast<microseconds>(end - start);
}
#if NO_PRINT==0
cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
printf("\nc\n");
mat_print(c, MAT_FMT, N, K);
#endif
printf("optimized mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
cycles = 0;
duration = microseconds(0);
while (duration.count() < 1e6) {
auto start = high_resolution_clock::now();
dumb_mat_mul<MAT_TYPE><<<gridDim, blockDim>>>(d_a, d_b, d_c, N, M, K);
cudaDeviceSynchronize();
auto end = high_resolution_clock::now();
cycles++;
duration += duration_cast<microseconds>(end - start);
}
#if NO_PRINT==0
cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
printf("\nc\n");
mat_print(c, MAT_FMT, N, K);
#endif
printf("dumb mul take %f usec avg in %d cycles\n", (float)(duration.count()) / cycles, cycles);
cudaFree(a);
cudaFree(b);
cudaFree(c);
free(buf);
}
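
The benchmark above never validates the kernels against a CPU reference (matrices are only printed when NO_PRINT is 0). A minimal spot-check that could be added after copying C back to the host is sketched below; verify_sample is a hypothetical helper, not part of the original file, and it assumes the N/M/K macros and the host arrays a, b, c from main():

```cuda
#include <cstdlib>  // rand()

// Spot-check a few random elements of C against a CPU-computed reference.
static bool verify_sample(const MAT_TYPE *A, const MAT_TYPE *B, const MAT_TYPE *C,
                          int samples) {
  for (int s = 0; s < samples; s++) {
    int row = rand() % N, col = rand() % K;
    MAT_TYPE ref = 0;
    for (int i = 0; i < M; i++) ref += A[row * M + i] * B[i * K + col];
    if (ref != C[row * K + col]) return false;  // mismatch found
  }
  return true;
}

// Usage sketch:
//   cudaMemcpy(c, d_c, C_SIZE, cudaMemcpyDeviceToHost);
//   if (!verify_sample(a, b, c, 256)) printf("result mismatch\n");
```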

5/data science/1e/main.py (new file, 27 lines)

@ -0,0 +1,27 @@
import time
import numpy as np
import cupy as cp

# Average the runtime of a matrix multiply over roughly one second of repetitions.
# The explicit stream synchronization makes the CuPy (GPU) timing accurate;
# it costs essentially nothing in the NumPy case.
def measure(a, b):
    duration = 0
    cycles = 0
    while duration < 1:
        start = time.perf_counter()
        c = a @ b  # the product being timed
        cp.cuda.Stream.null.synchronize()
        end = time.perf_counter()
        duration += end - start
        cycles += 1
    return duration / cycles

n = 1024
a = np.random.rand(n, n).astype(np.float32)
b = np.random.rand(n, n).astype(np.float32)
print('numpy takes', measure(a, b) * 1e6, 'usec')
a = cp.random.rand(n, n, dtype=cp.float32)
b = cp.random.rand(n, n, dtype=cp.float32)
print('cupy takes', measure(a, b) * 1e6, 'usec')

5/data science/1e/op.ptx (new file, 171 lines)

@ -0,0 +1,171 @@
// Hand-written PTX helpers for multi-precision (128- and 256-bit) integer arithmetic.
// Operands are byte arrays of 64-bit limbs, most-significant limb first.
.version 8.4
.target sm_75
.address_size 64
// 128-bit add: carry chain across the two 64-bit limbs.
.visible .func add_u16(
.param .b64 out_c,
.param .align 16 .b8 in_a[16],
.param .align 16 .b8 in_b[16]
) {
.reg .u64 %ra<2>, %rb<2>;
.reg .b64 %rdc;
ld.param.b64 %rdc, [out_c];
ld.param.v2.u64 {%ra1, %ra0}, [in_a];
ld.param.v2.u64 {%rb1, %rb0}, [in_b];
add.cc.u64 %ra0, %ra0, %rb0;
addc.u64 %ra1, %ra1, %rb1;
st.v2.u64 [%rdc], {%ra1, %ra0};
ret;
}
// 128-bit subtract: borrow chain across the two 64-bit limbs.
.visible .func sub_u16(
.param .b64 out_c,
.param .align 16 .b8 in_a[16],
.param .align 16 .b8 in_b[16]
) {
.reg .u64 %ra<2>, %rb<2>;
.reg .b64 %rdc;
ld.param.b64 %rdc, [out_c];
ld.param.v2.u64 {%ra1, %ra0}, [in_a];
ld.param.v2.u64 {%rb1, %rb0}, [in_b];
sub.cc.u64 %ra0, %ra0, %rb0;
subc.u64 %ra1, %ra1, %rb1;
st.v2.u64 [%rdc], {%ra1, %ra0};
ret;
}
// 256-bit add across four 64-bit limbs.
.visible .func add_u32(
.param .b64 out_c,
.param .align 16 .b8 in_a[32],
.param .align 16 .b8 in_b[32]
) {
.reg .u64 %ra<4>, %rb<4>;
.reg .b64 %rdc;
ld.param.b64 %rdc, [out_c];
ld.param.v2.u64 {%ra3, %ra2}, [in_a];
ld.param.v2.u64 {%ra1, %ra0}, [in_a + 16];
ld.param.v2.u64 {%rb3, %rb2}, [in_b];
ld.param.v2.u64 {%rb1, %rb0}, [in_b + 16];
add.cc.u64 %ra0, %ra0, %rb0;
addc.cc.u64 %ra1, %ra1, %rb1;
addc.cc.u64 %ra2, %ra2, %rb2;
addc.u64 %ra3, %ra3, %rb3;
st.v2.u64 [%rdc], {%ra3, %ra2};
st.v2.u64 [%rdc + 16], {%ra1, %ra0};
ret;
}
// 256-bit subtract across four 64-bit limbs.
.visible .func sub_u32(
.param .b64 out_c,
.param .align 16 .b8 in_a[32],
.param .align 16 .b8 in_b[32]
) {
.reg .u64 %ra<4>, %rb<4>;
.reg .b64 %rdc;
ld.param.b64 %rdc, [out_c];
ld.param.v2.u64 {%ra3, %ra2}, [in_a];
ld.param.v2.u64 {%ra1, %ra0}, [in_a + 16];
ld.param.v2.u64 {%rb3, %rb2}, [in_b];
ld.param.v2.u64 {%rb1, %rb0}, [in_b + 16];
sub.cc.u64 %ra0, %ra0, %rb0;
subc.cc.u64 %ra1, %ra1, %rb1;
subc.cc.u64 %ra2, %ra2, %rb2;
subc.u64 %ra3, %ra3, %rb3;
st.v2.u64 [%rdc], {%ra3, %ra2};
st.v2.u64 [%rdc + 16], {%ra1, %ra0};
ret;
}
// Low 128 bits of a 128 x 128-bit product (Karatsuba-style middle term, arithmetic mod 2^64).
.visible .func mul_lo_u16(
.param .b64 out_c,
.param .align 16 .b8 in_a[16],
.param .align 16 .b8 in_b[16]
) {
.reg .u64 %a, %b, %c, %d, %a_b, %c_d;
.reg .u64 %ac, %bd_hi, %bd_lo, %p;
.reg .b64 %rdc;
ld.param.b64 %rdc, [out_c];
ld.param.v2.u64 {%a, %b}, [in_a];
ld.param.v2.u64 {%c, %d}, [in_b];
mul.lo.u64 %ac, %a, %c;
mul.lo.u64 %bd_lo, %b, %d;
mul.hi.u64 %bd_hi, %b, %d;
add.u64 %a_b, %a, %b;
add.u64 %c_d, %c, %d;
mul.lo.u64 %p, %a_b, %c_d;
sub.u64 %p, %p, %ac;
sub.u64 %p, %p, %bd_lo;
add.u64 %p, %p, %bd_hi;
st.v2.u64 [%rdc], {%p, %bd_lo};
ret;
}
// Full 128 x 128-bit product: high half written to out_c_hi, low half to out_c_lo.
.visible .func mul_u16(
.param .b64 out_c_hi,
.param .b64 out_c_lo,
.param .align 16 .b8 in_a[16],
.param .align 16 .b8 in_b[16]
) {
.reg .u64 %a, %b, %c, %d;
.reg .u64 %a_b_hi, %a_b_lo, %c_d_hi, %c_d_lo;
.reg .u64 %p_hi, %p_lo, %p_hi2, %p_lo2;
.reg .u64 %ac_hi, %ac_lo, %bd_hi, %bd_lo;
.reg .b64 %rdc_hi, %rdc_lo;
ld.param.b64 %rdc_hi, [out_c_hi];
ld.param.b64 %rdc_lo, [out_c_lo];
ld.param.v2.u64 {%a, %b}, [in_a];
ld.param.v2.u64 {%c, %d}, [in_b];
mul.lo.u64 %ac_lo, %a, %c;
mul.hi.u64 %ac_hi, %a, %c;
mul.lo.u64 %bd_lo, %b, %d;
mul.hi.u64 %bd_hi, %b, %d;
add.cc.u64 %a_b_lo, %a, %b;
addc.u64 %a_b_hi, %a, %b;
add.cc.u64 %c_d_lo, %c, %d;
addc.u64 %c_d_hi, %c, %d;
mul.lo.u64 %p_lo, %a_b_lo, %c_d_lo;
mul.hi.u64 %p_hi, %a_b_lo, %c_d_lo;
mul.lo.u64 %p_hi2, %a_b_hi, %c_d_hi;
st.v2.u64 [%rdc_lo], {%p_hi, %p_lo};
st.v2.u64 [%rdc_hi], {%a_b_lo, %p_hi2};
ret;
}
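
For cross-checking the routines above on the host, here is a small C++ sketch (an assumption: a compiler with the unsigned __int128 extension, e.g. gcc or clang on x86-64; the struct and function names are illustrative) of what add_u16, sub_u16 and mul_lo_u16 compute under the most-significant-limb-first convention:

```cpp
#include <cstdint>

// Host-side model of the 128-bit helpers in op.ptx.
// hi mirrors the .x component (most-significant 64 bits), lo mirrors .y.
struct u128_limbs { uint64_t hi, lo; };

static unsigned __int128 pack(u128_limbs v) {
    return ((unsigned __int128)v.hi << 64) | v.lo;
}
static u128_limbs unpack(unsigned __int128 v) {
    return { (uint64_t)(v >> 64), (uint64_t)v };
}

// add_u16 / sub_u16: 128-bit add and subtract with wrap-around.
u128_limbs add_u16_ref(u128_limbs a, u128_limbs b) { return unpack(pack(a) + pack(b)); }
u128_limbs sub_u16_ref(u128_limbs a, u128_limbs b) { return unpack(pack(a) - pack(b)); }

// mul_lo_u16: low 128 bits of the 128 x 128-bit product.
u128_limbs mul_lo_u16_ref(u128_limbs a, u128_limbs b) { return unpack(pack(a) * pack(b)); }
```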

@ -0,0 +1,205 @@
#include <stdio.h>
#include <stdint.h>
extern "C" __device__ void add_u16(
ulonglong2 *out_c,
ulonglong2 in_a,
ulonglong2 in_b
);
extern "C" __device__ void sub_u16(
ulonglong2 *out_c,
ulonglong2 in_a,
ulonglong2 in_b
);
extern "C" __device__ void add_u32(
ulonglong4 *out_c,
ulonglong4 in_a,
ulonglong4 in_b
);
extern "C" __device__ void sub_u32(
ulonglong4 *out_c,
ulonglong4 in_a,
ulonglong4 in_b
);
extern "C" __device__ void mul_lo_u16(
ulonglong2 *out_c,
ulonglong2 in_a,
ulonglong2 in_b
);
extern "C" __device__ void mul_u16(
ulonglong2 *out_c_hi,
ulonglong2 *out_c_lo,
ulonglong2 in_a,
ulonglong2 in_b
);
__device__ bool equ_u16(ulonglong2 a, ulonglong2 b) {
return a.x == b.x && a.y == b.y;
}
__device__ bool equ_u32(ulonglong4 a, ulonglong4 b) {
return a.x == b.x &&
a.y == b.y &&
a.z == b.z &&
a.w == b.w;
}
__device__ int cmp_u32(ulonglong4 a, ulonglong4 b) {
if (a.x < b.x)
return -1;
else if (a.x > b.x)
return 1;
if (a.y < b.y)
return -1;
else if (a.y > b.y)
return 1;
if (a.z < b.z)
return -1;
else if (a.z > b.z)
return 1;
if (a.w < b.w)
return -1;
else if (a.w > b.w)
return 1;
return 0;
}
// Low 256 bits of a 256 x 256-bit product, built from the 128-bit PTX primitives
// (Karatsuba-style middle term, arithmetic mod 2^128).
__device__ void mul_lo_u32(
ulonglong4 *out_c,
ulonglong4 in_a,
ulonglong4 in_b
) {
auto a = (ulonglong2 *)&in_a.x;
auto b = (ulonglong2 *)&in_a.z;
auto c = (ulonglong2 *)&in_b.x;
auto d = (ulonglong2 *)&in_b.z;
ulonglong2 a_b, c_d, ac, bd_hi, bd_lo, p;
mul_lo_u16(&ac, *a, *c);
mul_u16(&bd_hi, &bd_lo, *b, *d);
add_u16(&a_b, *a, *b);
add_u16(&c_d, *c, *d);
mul_lo_u16(&p, a_b, c_d);
sub_u16(&p, p, ac);
sub_u16(&p, p, bd_lo);
add_u16(&p, p, bd_hi);
out_c->x = p.x;
out_c->y = p.y;
out_c->z = bd_lo.x;
out_c->w = bd_lo.y;
}
// Print limbs most-significant first, matching dothex() in test.py.
__device__ void print_u16(ulonglong2 a) {
printf("0x%016llx.%016llx\n", a.x, a.y);
}
__device__ void print_u32(ulonglong4 a) {
printf("0x%016llx.%016llx.%016llx.%016llx\n", a.x, a.y, a.z, a.w);
}
#define U8_MAX 0xFFFFFFFFFFFFFFFF
#define U16_MAX {U8_MAX, U8_MAX}
#define U32_MAX {U8_MAX, U8_MAX, U8_MAX, U8_MAX}
// Single-thread kernel that exercises each primitive against precomputed
// expected values (see test.py for how the constants were derived).
__global__ void test(bool *passed) {
*passed = true;
{
ulonglong4 a = U32_MAX;
ulonglong4 b = {0, 0, 0, 1};
ulonglong4 c = {0, 0, 0, 0};
add_u32(&a, a, b);
if (!equ_u32(a, c)) {
printf("add_u32\n");
print_u32(a);
*passed = false;
}
}
{
ulonglong4 a = {0, 0, 0, 0};
ulonglong4 b = {0, 0, 0, 1};
ulonglong4 c = U32_MAX;
sub_u32(&a, a, b);
if (!equ_u32(a, c)) {
printf("sub_u32\n");
print_u32(a);
*passed = false;
}
}
{
ulonglong2 a = U16_MAX;
ulonglong2 b = {0, U8_MAX};
ulonglong2 c = {U8_MAX, 1};
mul_lo_u16(&a, a, b);
if (!equ_u16(a, c)) {
printf("mul_lo_u16\n");
print_u16(a);
*passed = false;
}
}
{
ulonglong2 a = U16_MAX;
ulonglong2 b = {0, U8_MAX};
ulonglong2 c_hi = {0, U8_MAX - 1};
ulonglong2 c_lo = {U8_MAX, 1};
mul_u16(&a, &b, a, b);
if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
printf("mul_u16\n");
print_u16(a);
print_u16(b);
*passed = false;
}
a = U16_MAX;
b = U16_MAX;
c_hi = {U8_MAX, U8_MAX - 1};
c_lo = {0, 1};
mul_u16(&a, &b, a, b);
if (!equ_u16(a, c_hi) || !equ_u16(b, c_lo)) {
printf("mul_u16\n");
print_u16(a);
print_u16(b);
*passed = false;
}
}
{
ulonglong4 a = U32_MAX;
ulonglong4 b = {0, 0, U8_MAX, U8_MAX};
ulonglong4 c = {U8_MAX, U8_MAX, 0, 1};
mul_lo_u32(&a, a, b);
if (!equ_u32(a, c)) {
printf("mul_lo_u32\n");
print_u32(a);
*passed = false;
}
}
}
int main() {
bool test_passed, *d_test_passed;
cudaMalloc(&d_test_passed, sizeof(bool));
test<<<1, 1>>>(d_test_passed);
cudaDeviceSynchronize();
cudaMemcpy(&test_passed, d_test_passed, sizeof(bool), cudaMemcpyDeviceToHost);
cudaFree(d_test_passed);
if (!test_passed) {
printf("test not passed\n");
return 1;
}
return 0;
}
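
The harness above does not check for CUDA API or launch errors, so a failed launch would leave test_passed holding stale host memory. A minimal sketch of the kind of check that could be wrapped around the existing calls (the CUDA_CHECK macro name is illustrative, not part of the original):

```cuda
#include <cstdio>
#include <cstdlib>

// Abort with file/line and the CUDA error string when a call fails.
#define CUDA_CHECK(call)                                                     \
  do {                                                                       \
    cudaError_t err_ = (call);                                               \
    if (err_ != cudaSuccess) {                                               \
      fprintf(stderr, "%s:%d: %s\n", __FILE__, __LINE__,                     \
              cudaGetErrorString(err_));                                     \
      exit(1);                                                               \
    }                                                                        \
  } while (0)

// Usage sketch:
//   CUDA_CHECK(cudaMalloc(&d_test_passed, sizeof(bool)));
//   test<<<1, 1>>>(d_test_passed);
//   CUDA_CHECK(cudaGetLastError());       // catches launch-time failures
//   CUDA_CHECK(cudaDeviceSynchronize());  // catches errors raised inside the kernel
```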

5/data science/1e/test.py (new file, 22 lines)

@ -0,0 +1,22 @@
# Host-side reference values for the PTX tests in main.cu.
U8_MAX = 0xFFFFFFFFFFFFFFFF
U16_MAX = U8_MAX << 64 | U8_MAX
U32_MAX = U16_MAX << 128 | U16_MAX

# Format an arbitrary-precision integer as dot-separated 64-bit hex limbs,
# matching print_u16/print_u32 in main.cu.
def dothex(num):
    strhex = hex(num)[2:]
    dothex = strhex[-16:]
    strhex = strhex[:-16]
    while len(strhex) > 0:
        dothex = strhex[-16:] + '.' + dothex
        strhex = strhex[:-16]
    return '0x' + dothex

print('mul_u16', dothex((U16_MAX * U8_MAX >> 128) % (U16_MAX + 1)),
      dothex(U16_MAX * U8_MAX % (U16_MAX + 1)))
print('mul_u16', dothex((U16_MAX * U16_MAX >> 128) % (U16_MAX + 1)),
      dothex(U16_MAX * U16_MAX % (U16_MAX + 1)))
print('mul_lo_u32', dothex(U32_MAX * U16_MAX % (U32_MAX + 1)))
print('div_lo_u32', dothex(U32_MAX // U8_MAX), dothex(U32_MAX - U32_MAX // U8_MAX))

File diff suppressed because it is too large

@ -0,0 +1,958 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "def31b0f-921a-43eb-9807-8b9b31eb7b32",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
},
{
"cell_type": "markdown",
"id": "4a0fd4dd-f7be-4c90-8ddd-384a760ac04f",
"metadata": {},
"source": [
"# Fundamentals of Accelerated Data Science # "
]
},
{
"cell_type": "markdown",
"id": "6a8fdf2e-a481-455e-8a52-8be8472b63bf",
"metadata": {},
"source": [
"## 03 - Memory Management ##\n",
"\n",
"**Table of Contents**\n",
"<br>\n",
"This notebook explores the dynamics between data and memory. This notebook covers the below sections: \n",
"1. [Memory Management](#Memory-Management)\n",
" * [Memory Usage](#Memory-Usage)\n",
"2. [Data Types](#Data-Types)\n",
" * [Convert Data Types](#Convert-Data-Types)\n",
" * [Exercise #1 - Modify `dtypes`](#Exercise-#1---Modify-dtypes)\n",
" * [Categorical](#Categorical)\n",
"3. [Efficient Data Loading](#Efficient-Data-Loading)"
]
},
{
"cell_type": "markdown",
"id": "1b59367c-48bc-4c72-b1f4-4cfdfa5470cf",
"metadata": {},
"source": [
"## Memory Management ##\n",
"During the data acquisition process, data is transferred to memory in order to be operated on by the processor. Memory management is crucial for cuDF and GPU operations for several key reasons: \n",
"* **Limited GPU memory**: GPUs typically have less memory than CPUs, therefore efficient memory management is essential to maximize the use of available GPU memory, especially for large datasets.\n",
"* **Data transfer overhead**: Transferring data between CPU and GPU memory is relatively slow compared to GPU computation speed. Minimizing these transfers through smart memory management is critical for performance.\n",
"* **Performance tuning**: Understanding and optimizing memory usage is key to achieving peak performance in GPU-accelerated data processing tasks.\n",
"\n",
"When done correctly, keeping the data on the GPU can enable cuDF and the RAPIDS ecosystem to achieve significant performance improvements, handle larger datasets, and provide more efficient data processing capabilities. \n",
"\n",
"Below we import the data from the csv file. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b7b8a623-f799-4dad-aca9-0e571bb6e527",
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"import pandas as pd\n",
"import random\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "711d0a7f-8598-49fc-949c-5caf6029ce47",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>age</th>\n",
" <th>sex</th>\n",
" <th>county</th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>name</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>m</td>\n",
" <td>DARLINGTON</td>\n",
" <td>54.533644</td>\n",
" <td>-1.524401</td>\n",
" <td>FRANCIS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>0</td>\n",
" <td>m</td>\n",
" <td>DARLINGTON</td>\n",
" <td>54.426256</td>\n",
" <td>-1.465314</td>\n",
" <td>EDWARD</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>0</td>\n",
" <td>m</td>\n",
" <td>DARLINGTON</td>\n",
" <td>54.555200</td>\n",
" <td>-1.496417</td>\n",
" <td>TEDDY</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>0</td>\n",
" <td>m</td>\n",
" <td>DARLINGTON</td>\n",
" <td>54.547906</td>\n",
" <td>-1.572341</td>\n",
" <td>ANGUS</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>0</td>\n",
" <td>m</td>\n",
" <td>DARLINGTON</td>\n",
" <td>54.477639</td>\n",
" <td>-1.605995</td>\n",
" <td>CHARLIE</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" age sex county lat long name\n",
"0 0 m DARLINGTON 54.533644 -1.524401 FRANCIS\n",
"1 0 m DARLINGTON 54.426256 -1.465314 EDWARD\n",
"2 0 m DARLINGTON 54.555200 -1.496417 TEDDY\n",
"3 0 m DARLINGTON 54.547906 -1.572341 ANGUS\n",
"4 0 m DARLINGTON 54.477639 -1.605995 CHARLIE"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"df=pd.read_csv('./data/uk_pop.csv')\n",
"\n",
"# preview\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"id": "36416fd0-7081-42aa-bf31-d1231b81ec0b",
"metadata": {},
"source": [
"### Memory Usage ###\n",
"Memory utilization of a DataFrame depends on the date types for each column.\n",
"\n",
"<p><img src='images/dtypes.png' width=720></p>\n",
"\n",
"We can use `DataFrame.memory_usage()` to see the memory usage for each column (in bytes). Most of the common data types have a fixed size in memory, such as `int`, `float`, `datetime`, and `bool`. Memory usage for these data types is the respective memory requirement multiplied by the number of data points. For `string` data type, the memory usage reported _for pandas_ is the number of elements times 8 bytes. This accounts for the 64-bit required for the pointer that points to an address in memory but not the memory used for the actual string values. The actual memory required for a string value is 49 bytes plus an additional byte for each character. The `deep` parameter provides a more accurate memory usage report that accounts for the system-level memory consumption of the contained `string` data type. \n",
"\n",
"Below we get the memory usage. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "8378207b-2d9e-4102-8408-c2dddafc8a40",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Index 128\n",
"age 467839152\n",
"sex 3391833852\n",
"county 3934985133\n",
"lat 467839152\n",
"long 467839152\n",
"name 3666922374\n",
"dtype: int64"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# pandas memory utilization\n",
"mem_usage_df=df.memory_usage(deep=True)\n",
"mem_usage_df"
]
},
{
"cell_type": "markdown",
"id": "07c24bb1-c4f7-440c-a949-d4c57800ec61",
"metadata": {},
"source": [
"Below we define a `make_decimal()` function to convert memory size into units based on powers of 2. In contrast to units based on powers of 10, this customary convention is commonly used to report memory capacity. More information about the two definitions can be found [here](https://en.wikipedia.org/wiki/Byte#Multiple-byte_units). "
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "5ae42218-1547-49fd-9123-ab508a2b03de",
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"suffixes = ['B', 'kB', 'MB', 'GB', 'TB', 'PB']\n",
"def make_decimal(nbytes):\n",
" i=0\n",
" while nbytes >= 1024 and i < len(suffixes)-1:\n",
" nbytes/=1024.\n",
" i+=1\n",
" f=('%.2f' % nbytes).rstrip('0').rstrip('.')\n",
" return '%s %s' % (f, suffixes[i])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "e6d4a613-3eea-4dce-8e71-39593ff6f226",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"'11.55 GB'"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"make_decimal(mem_usage_df.sum())"
]
},
{
"cell_type": "markdown",
"id": "a352c0b2-65aa-4231-b753-556aca46ff49",
"metadata": {},
"source": [
"Below we calculate the memory usage manually based on the data types. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "630327b9-6dc1-4b70-9fdf-9f7763ec4d50",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Numerical columns use 467839152 bytes of memory\n"
]
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# get number of rows\n",
"num_rows=len(df)\n",
"\n",
"# 64-bit numbers uses 8 bytes of memory\n",
"print(f'Numerical columns use {num_rows*8} bytes of memory')"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "bb22b5f4-e38f-438e-9426-61746b509e50",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"county column uses 3934985133 bytes of memory.\n"
]
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# check random string-typed column\n",
"string_cols=[col for col in df.columns if df[col].dtype=='object' ]\n",
"column_to_check=random.choice(string_cols)\n",
"\n",
"overhead=49\n",
"pointer_size=8\n",
"\n",
"# nan==nan when value is not a number\n",
"# nan uses 32 bytes of memory\n",
"string_col_mem_usage_df=df[column_to_check].map(lambda x: len(x)+overhead+pointer_size if x else 32)\n",
"string_col_mem_usage=string_col_mem_usage_df.sum()\n",
"print(f'{column_to_check} column uses {string_col_mem_usage} bytes of memory.')"
]
},
{
"cell_type": "markdown",
"id": "94e393c2-c0d0-40ee-82d2-730c4667e9b8",
"metadata": {},
"source": [
"**Note**: The `string` data type is stored differently in cuDF than it is in pandas. More information about `libcudf` stores string data using the [Arrow format](https://arrow.apache.org/docs/format/Columnar.html#variable-size-binary-layout) can be found [here](https://developer.nvidia.com/blog/mastering-string-transformations-in-rapids-libcudf/). "
]
},
{
"cell_type": "markdown",
"id": "737ff50b-9426-4e08-a00a-d7ee69f48b9f",
"metadata": {},
"source": [
"## Data Types ##\n",
"By default, pandas (and cuDF) uses 64-bit for numerical values. Using 64-bit numbers provides the highest precision but many applications do not require 64-bit precision when aggregating over a very large number of data points. When possible, using 32-bit numbers reduces storage and memory requirements in half, and also typically greatly speeds up computations because only half as much data needs to be accessed in memory. "
]
},
{
"cell_type": "markdown",
"id": "0b77d450-c415-44b8-87ac-20ce616ec809",
"metadata": {},
"source": [
"### Convert Data Types ###\n",
"The `.astype()` method can be used to convert numerical data types to use different bit-size containers. Here we convert the `age` column from `int64` to `int8`. "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "603f7c70-134e-4466-a790-8a18b9088ca6",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"age int8\n",
"sex object\n",
"county object\n",
"lat float64\n",
"long float64\n",
"name object\n",
"dtype: object"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"df['age']=df['age'].astype('int8')\n",
"\n",
"df.dtypes"
]
},
{
"cell_type": "markdown",
"id": "973a6dd4-2aef-44d9-8b01-8853032eddae",
"metadata": {},
"source": [
"### Exercise #1 - Modify `dtypes` ###\n",
"**Instructions**: <br>\n",
"* Modify the `<FIXME>` only and execute the below cell to convert any 64-bit data types to their 32-bit counterparts."
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "beb7d71b-6672-462e-b65c-a64dbe5f7a57",
"metadata": {},
"outputs": [],
"source": [
"df['lat']=df['lat'].astype('float32')\n",
"df['long']=df['long'].astype('float32')"
]
},
{
"cell_type": "raw",
"id": "3b44fb22-a0f1-4e43-a332-1ccbad50caee",
"metadata": {},
"source": [
"\n",
"df['lat']=df['lat'].astype('float32')\n",
"df['long']=df['long'].astype('float32')"
]
},
{
"cell_type": "markdown",
"id": "98b6542d-22cc-4926-b600-a3e052c37c96",
"metadata": {},
"source": [
"Click ... for solution. "
]
},
{
"cell_type": "markdown",
"id": "7b2cd622-977c-4915-a87f-2fe03c1793f5",
"metadata": {},
"source": [
"### Categorical ###\n",
"Categorical data is a type of data that represents discrete, distinct categories or groups. They can have a meaningful order or ranking but generally cannot be used for numerical operations. When appropriate, using the `categorical` data type can reduce memory usage and lead to faster operations. It can also be used to define and maintain a custom order of categories. \n",
"\n",
"Below we get the number of unique values in the string columns. "
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "f249e4b8-5d7a-4b44-ac15-bd3360a43f2a",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"sex 2\n",
"county 171\n",
"name 13212\n",
"dtype: int64"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"df.select_dtypes(include='object').nunique()"
]
},
{
"cell_type": "markdown",
"id": "f1d8bd88-b39b-4043-9039-d8bd75fe851a",
"metadata": {},
"source": [
"Below we convert columns with few discrete values to `category`. The `category` data type has `.categories` and `codes` properties that are accessed through `.cat`. "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "a99bebbf-2e5b-4720-96f9-9fd7d42d2fe8",
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"df['sex']=df['sex'].astype('category')\n",
"df['county']=df['county'].astype('category')"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "41b7b290-cfcf-4ff6-b6b4-454c19b44a62",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Index(['BARKING AND DAGENHAM', 'BARNET', 'BARNSLEY',\n",
" 'BATH AND NORTH EAST SOMERSET', 'BEDFORD', 'BEXLEY', 'BIRMINGHAM',\n",
" 'BLACKBURN WITH DARWEN', 'BLACKPOOL', 'BLAENAU GWENT',\n",
" ...\n",
" 'WESTMINSTER', 'WIGAN', 'WILTSHIRE', 'WINDSOR AND MAIDENHEAD', 'WIRRAL',\n",
" 'WOKINGHAM', 'WOLVERHAMPTON', 'WORCESTERSHIRE', 'WREXHAM', 'YORK'],\n",
" dtype='object', length=171)"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"----------------------------------------\n"
]
},
{
"data": {
"text/plain": [
"0 37\n",
"1 37\n",
"2 37\n",
"3 37\n",
"4 37\n",
" ..\n",
"58479889 96\n",
"58479890 96\n",
"58479891 96\n",
"58479892 96\n",
"58479893 96\n",
"Length: 58479894, dtype: int16"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"display(df['county'].cat.categories)\n",
"print('-'*40)\n",
"display(df['county'].cat.codes)"
]
},
{
"cell_type": "markdown",
"id": "737385ab-677c-4bef-a86a-10aa3119e29a",
"metadata": {},
"source": [
"**Note**: `.astype()` can also be used to convert data to `datetime` or `object` to enable datetime and string methods. "
]
},
{
"cell_type": "markdown",
"id": "552c47c2-0fbc-455e-8745-cb98fc777243",
"metadata": {},
"source": [
"## Efficient Data Loading ##\n",
"It is often advantageous to specify the most appropriate data types for each columns, based on range, precision requirement, and how they are used. "
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "c2b9f0c3-8598-4a28-9481-ce28fea7544b",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"Index 128\n",
"age 467839152\n",
"sex 3391833852\n",
"county 3934985133\n",
"lat 467839152\n",
"long 467839152\n",
"name 3666922374\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading 11.55 GB took 33.63 seconds.\n"
]
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"start=time.time()\n",
"df=pd.read_csv('./data/uk_pop.csv')\n",
"duration=time.time()-start\n",
"\n",
"mem_usage_df=df.memory_usage(deep=True)\n",
"display(mem_usage_df)\n",
"\n",
"print(f'Loading {make_decimal(mem_usage_df.sum())} took {round(duration, 2)} seconds.')"
]
},
{
"cell_type": "markdown",
"id": "5729520e-3ed8-4ec6-ae1f-ba46d642f48d",
"metadata": {},
"source": [
"Below we enable `cuda.pandas` to see the difference. "
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "99aa0f32-4d2a-43a7-bec1-f1b88bcc37c2",
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"%load_ext cudf.pandas\n",
"\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 16,
"id": "2b724201-9ad1-4e9b-b712-f3b31bdc4104",
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"suffixes = ['B', 'kB', 'MB', 'GB', 'TB', 'PB']\n",
"def make_decimal(nbytes):\n",
" i=0\n",
" while nbytes >= 1024 and i < len(suffixes)-1:\n",
" nbytes/=1024.\n",
" i+=1\n",
" f=('%.2f' % nbytes).rstrip('0').rstrip('.')\n",
" return '%s %s' % (f, suffixes[i])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "99bdd7b0-8563-41db-bd8e-3a7279394ede",
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/plain": [
"age 58479894\n",
"sex 58479908\n",
"county 58482446\n",
"lat 467839152\n",
"long 467839152\n",
"name 117096917\n",
"Index 0\n",
"dtype: int64"
]
},
"metadata": {},
"output_type": "display_data"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Loading 1.14 GB took 2.13 seconds.\n"
]
},
{
"data": {
"text/html": [
"<pre style=\"white-space:pre;overflow-x:auto;line-height:normal;font-family:Menlo,'DejaVu Sans Mono',consolas,'Courier New',monospace\"><span style=\"font-style: italic\"> </span>\n",
"<span style=\"font-style: italic\"> Total time elapsed: 2.705 seconds </span>\n",
"<span style=\"font-style: italic\"> </span>\n",
"<span style=\"font-style: italic\"> Stats </span>\n",
"<span style=\"font-style: italic\"> </span>\n",
"┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
"┃<span style=\"font-weight: bold\"> Line no. </span>┃<span style=\"font-weight: bold\"> Line </span>┃<span style=\"font-weight: bold\"> GPU TIME(s) </span>┃<span style=\"font-weight: bold\"> CPU TIME(s) </span>┃\n",
"┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
"│ 2 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> start</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time()</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 5 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> dtype_dict</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">{</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 6 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'age'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'int8'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 7 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'sex'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 8 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'county'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 9 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'lat'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'float64'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 10 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'long'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'float64'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, </span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 11 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'name'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">: </span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'category'</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 14 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> efficient_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">pd</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">read_csv(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'./data/uk_pop.csv'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">, dtype</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">dtype_dict)</span><span style=\"background-color: #272822\"> </span> │ 1.728013188 │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 15 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> duration</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">time()</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">-</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">start</span><span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 17 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> mem_usage_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">=</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">efficient_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">memory_usage(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">'deep'</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">)</span><span style=\"background-color: #272822\"> </span> │ 0.005340174 │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 18 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> display(mem_usage_df)</span><span style=\"background-color: #272822\"> </span> │ 0.011073721 │ 0.006896915 │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"│ 20 │ <span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\"> print(</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">f'Loading {</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">make_decimal(mem_usage_df</span><span style=\"color: #ff4689; text-decoration-color: #ff4689; background-color: #272822\">.</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">sum())</span><span style=\"color: #e6db74; text-decoration-color: #e6db74; background-color: #272822\">} took {</span><span style=\"color: #f8f8f2; text-decoration-color: #f8f8f2; background-color: #272822\">round(dura…</span> │ 0.004693074 │ │\n",
"│ │ <span style=\"background-color: #272822\"> </span> │ │ │\n",
"└──────────┴──────────────────────────────────────────────────────────────────────────┴─────────────┴─────────────┘\n",
"</pre>\n"
],
"text/plain": [
"\u001b[3m \u001b[0m\n",
"\u001b[3m Total time elapsed: 2.705 seconds \u001b[0m\n",
"\u001b[3m \u001b[0m\n",
"\u001b[3m Stats \u001b[0m\n",
"\u001b[3m \u001b[0m\n",
"┏━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━┳━━━━━━━━━━━━━┓\n",
"┃\u001b[1m \u001b[0m\u001b[1mLine no.\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mLine \u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mGPU TIME(s)\u001b[0m\u001b[1m \u001b[0m┃\u001b[1m \u001b[0m\u001b[1mCPU TIME(s)\u001b[0m\u001b[1m \u001b[0m┃\n",
"┡━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━╇━━━━━━━━━━━━━┩\n",
"│ 2 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mstart\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 5 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype_dict\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m{\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 6 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mage\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mint8\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 7 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34msex\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 8 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcounty\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 9 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mlat\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mfloat64\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 10 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mlong\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mfloat64\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 11 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mname\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m:\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mcategory\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 14 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mefficient_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mpd\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mread_csv\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m./data/uk_pop.csv\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m,\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdtype_dict\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 1.728013188 │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 15 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mduration\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mtime\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m-\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mstart\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 17 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m=\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mefficient_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmemory_usage\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mdeep\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 0.005340174 │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 18 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdisplay\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[48;2;39;40;34m \u001b[0m │ 0.011073721 │ 0.006896915 │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"│ 20 │ \u001b[38;2;248;248;242;48;2;39;40;34m \u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mprint\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mf\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m'\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34mLoading \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m{\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmake_decimal\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mmem_usage_df\u001b[0m\u001b[38;2;255;70;137;48;2;39;40;34m.\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34msum\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m)\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m}\u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m took \u001b[0m\u001b[38;2;230;219;116;48;2;39;40;34m{\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mround\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34m(\u001b[0m\u001b[38;2;248;248;242;48;2;39;40;34mdura…\u001b[0m │ 0.004693074 │ │\n",
"│ │ \u001b[48;2;39;40;34m \u001b[0m │ │ │\n",
"└──────────┴──────────────────────────────────────────────────────────────────────────┴─────────────┴─────────────┘\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%%cudf.pandas.line_profile\n",
"# DO NOT CHANGE THIS CELL\n",
"start=time.time()\n",
"\n",
"# define data types for each column\n",
"dtype_dict={\n",
" 'age': 'int8', \n",
" 'sex': 'category', \n",
" 'county': 'category', \n",
" 'lat': 'float64', \n",
" 'long': 'float64', \n",
" 'name': 'category'\n",
"}\n",
" \n",
"efficient_df=pd.read_csv('./data/uk_pop.csv', dtype=dtype_dict)\n",
"duration=time.time()-start\n",
"\n",
"mem_usage_df=efficient_df.memory_usage('deep')\n",
"display(mem_usage_df)\n",
"\n",
"print(f'Loading {make_decimal(mem_usage_df.sum())} took {round(duration, 2)} seconds.')"
]
},
{
"cell_type": "markdown",
"id": "0f4607d8-6de3-4b27-96d4-a9720d268333",
"metadata": {},
"source": [
"We were able to load data faster and more efficiently. \n",
"\n",
"**Note**: Notice that the memory utilized on the GPU is larger than the memory used by the DataFrame. This is expected because there are intermediary processes that use some memory during the data loading process, specifically related to parsing the csv file in this case. \n",
"\n",
"```\n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 525.60.13 Driver Version: 525.60.13 CUDA Version: 12.0 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 Off | 00000000:00:1B.0 Off | 0 |\n",
"| N/A 32C P0 26W / 70W | 1378MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 1 Tesla T4 Off | 00000000:00:1C.0 Off | 0 |\n",
"| N/A 31C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 2 Tesla T4 Off | 00000000:00:1D.0 Off | 0 |\n",
"| N/A 30C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 3 Tesla T4 Off | 00000000:00:1E.0 Off | 0 |\n",
"| N/A 30C P0 26W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"+-----------------------------------------------------------------------------+\n",
"```"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "92f7ee37-4acb-46aa-bb73-4c0139d3f6b8",
"metadata": {
"scrolled": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Tue Oct 21 08:08:25 2025 \n",
"+-----------------------------------------------------------------------------+\n",
"| NVIDIA-SMI 525.85.12 Driver Version: 525.85.12 CUDA Version: 12.0 |\n",
"|-------------------------------+----------------------+----------------------+\n",
"| GPU Name Persistence-M| Bus-Id Disp.A | Volatile Uncorr. ECC |\n",
"| Fan Temp Perf Pwr:Usage/Cap| Memory-Usage | GPU-Util Compute M. |\n",
"| | | MIG M. |\n",
"|===============================+======================+======================|\n",
"| 0 Tesla T4 On | 00000000:00:1B.0 Off | 0 |\n",
"| N/A 28C P0 24W / 70W | 11314MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 1 Tesla T4 On | 00000000:00:1C.0 Off | 0 |\n",
"| N/A 29C P0 25W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 2 Tesla T4 On | 00000000:00:1D.0 Off | 0 |\n",
"| N/A 28C P0 25W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
"| 3 Tesla T4 On | 00000000:00:1E.0 Off | 0 |\n",
"| N/A 29C P0 24W / 70W | 168MiB / 15360MiB | 0% Default |\n",
"| | | N/A |\n",
"+-------------------------------+----------------------+----------------------+\n",
" \n",
"+-----------------------------------------------------------------------------+\n",
"| Processes: |\n",
"| GPU GI CI PID Type Process name GPU Memory |\n",
"| ID ID Usage |\n",
"|=============================================================================|\n",
"+-----------------------------------------------------------------------------+\n"
]
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"!nvidia-smi"
]
},
{
"cell_type": "markdown",
"id": "c031d2c7-03cb-4ac7-a195-70fc25cb191d",
"metadata": {},
"source": [
"When loading data this way, we may be able to fit more data. The optimal dataset size depends on various factors including the specific operations being performed, the complexity of the workload, and the available GPU memory. To maximize acceleration, datasets should ideally fit within GPU memory, with ample space left for operations that can spike memory requirements. As a general rule of thumb, cuDF recommends data sets that are less than 50% of the GPU memory capacity. "
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ec6cefea-dc64-4f13-815e-081cd35651b9",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# 1 gigabytes = 1073741824 bytes\n",
"mem_capacity=16*1073741824\n",
"\n",
"mem_per_record=mem_usage_df.sum()/len(efficient_df)\n",
"\n",
"print(f'We can load {int(mem_capacity/2/mem_per_record)} rows.')"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ddaaa1ac-66ec-4323-9842-2543c6d85e4e",
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"id": "658e9847-775f-4d12-af4e-8f896df4e6fe",
"metadata": {},
"source": [
"**Well Done!** Let's move to the [next notebook](1-04_interoperability.ipynb). "
]
},
{
"cell_type": "markdown",
"id": "b86451cf-60e6-4733-b431-1bc0bd586bc2",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,669 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "2d190a78-7253-4fad-9d9c-6b4fb33c8bf2",
"metadata": {
"tags": []
},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
},
{
"cell_type": "markdown",
"id": "8a2c4abf-6278-4edd-83f8-f0afac4c834f",
"metadata": {},
"source": [
"# Fundamentals of Accelerated Data Science #"
]
},
{
"cell_type": "markdown",
"id": "e1e78ef4-c0de-433e-8616-bd946f69d30e",
"metadata": {},
"source": [
"## 04 - cuGraph as a NetworkX backend ##"
]
},
{
"cell_type": "markdown",
"id": "0828e0b4-7935-4b77-95ef-e06b72f0319e",
"metadata": {},
"source": [
"**Table of Contents**\n",
"<br>\n",
"This notebook introduces the various methods of utilizing the cuGraph backend for NetworkX and runs centrality algorithms on the dataset. This notebook covers the below sections:\n",
"1. [Background](#Background)\n",
"2. [Installation](#Installation)\n",
"3. [Utilizing nx-cugraph](#Utilizing-nx-cugraph)\n",
" * [Runtime Environment Variable](#Runtime-Environment-Variable)\n",
" * [Backend Keyword Argument](#Backend-Keyword-Argument)\n",
" * [Type-Based Dispatching](#Type-Based-Dispatching)\n",
"4. [Computing Centrality](#Computing-Centrality)\n",
" * [Creating Graph](#Creating-Graph)\n",
" * [Running Centrality Algorithms](#Running-Centrality-Algorithms)\n",
" * [Betweenness Centrality](#Betweenness-Centrality)\n",
" * [Degree Centrality](#Degree-Centrality)\n",
" * [Katz Centrality](#Katz-Centrality)\n",
" * [Pagerank Centrality](#Pagerank-Centrality)\n",
" * [Eigenvector Centrality](#Eigenvector-Centrality)\n",
" * [Visualize Results](#Visualize-Results)\n",
" * [Exercise #1 - Type Dispatch](#Exercise-#1---Type-Dispatch)"
]
},
{
"cell_type": "markdown",
"id": "c57b79ba-c7c7-49d2-9e21-c388bbe6ca98",
"metadata": {},
"source": [
"## Background ##\n",
"RAPIDS recently introduced a new backend to NetworkX called nx-cugraph. With this backend, you can automatically accelerate supported algorithms. In this notebook, we will cover the various methods of enabling the cugraph backend, and use the backend to run different centrality algorithms."
]
},
{
"cell_type": "markdown",
"id": "697ea4c9-b416-43d5-9d2c-28aa41ef2561",
"metadata": {},
"source": [
"## Installation ##\n",
"We have already prepared the environment with nx-cugraph installed. When you are using your own environment, below is the command for installation. "
]
},
{
"cell_type": "raw",
"id": "2fe07200-4f66-4604-9950-40ade1938f4c",
"metadata": {},
"source": [
"pip install nx-cugraph-cu12 --no-deps --extra-index-url https://pypi.anaconda.org/rapidsai-wheels-nightly/simple"
]
},
{
"cell_type": "markdown",
"id": "a9ea09f4-6c93-4785-bcc3-44c6f040dfc6",
"metadata": {},
"source": [
"## Utilizing nx-cugraph ##\n",
"There are 3 ways to utilize nx-cugraph\n",
"\n",
"1. **Environment Variable at Runtime**\n",
"2. **Backend keyword argument**\n",
"3. **Type-Based dispatching**\n",
"\n",
"Let's dig a little deeper in to each of these methods."
]
},
{
"cell_type": "markdown",
"id": "8b4322fd-9f56-4cbc-a00c-8fac4b2b2fe1",
"metadata": {},
"source": [
"### Runtime Environment Variable ###\n",
"The NETWORKX_AUTOMATIC_BACKENDS environment variable can be used to have NetworkX automatically dispatch to specified backends. Set NETWORKX_AUTOMATIC_BACKENDS=cugraph to use nx-cugraph to GPU accelerate supported APIs with no code changes. We will also be loading the cuDF pandas module to accelerate csv loading."
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "b41fef7f-5d43-4481-98a7-d9f3cb54066c",
"metadata": {},
"outputs": [],
"source": [
"!NETWORKX_AUTOMATIC_BACKENDS=cugraph python -m cudf.pandas scripts/networkx.py"
]
},
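{
"cell_type": "markdown",
"id": "6e3c1f24-2f50-4d9c-9e41-9a7c3d8f0e32",
"metadata": {},
"source": [
"The key point is that the script itself needs no cuGraph-specific code. The actual contents of `scripts/networkx.py` are not shown here, but a hypothetical script of this shape, written in plain NetworkX, would be GPU-accelerated by the environment variable alone (with `cudf.pandas` handling the CSV load):\n",
"\n",
"```python\n",
"# Hypothetical illustration only -- not the actual course script.\n",
"# Plain NetworkX code; with NETWORKX_AUTOMATIC_BACKENDS=cugraph set,\n",
"# supported calls such as betweenness_centrality dispatch to the GPU.\n",
"import networkx as nx\n",
"import pandas as pd\n",
"\n",
"df = pd.read_csv('./data/road_graph.csv', nrows=1000)\n",
"G = nx.from_pandas_edgelist(df, source='src', target='dst', edge_attr='length')\n",
"print(nx.betweenness_centrality(G, k=1000))\n",
"```"
]
},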
{
"cell_type": "markdown",
"id": "5ffb6c4b-a03a-4bfb-9b92-14c59e6dcd75",
"metadata": {},
"source": [
"### Backend Keyword Argument ###\n",
"NetworkX also supports explicitly specifying a particular backend for supported APIs with the backend= keyword argument. This argument takes precedence over the NETWORKX_AUTOMATIC_BACKENDS environment variable. This method also requires that the specified backend already be installed."
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "8183ecc7-8544-4914-8c07-c904ba12225a",
"metadata": {},
"outputs": [],
"source": [
"import warnings\n",
"warnings.filterwarnings('ignore')\n",
"\n",
"%load_ext cudf.pandas\n",
"import networkx as nx\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"\n",
"# Load the CSV file\n",
"road_graph = pd.read_csv('./data/road_graph.csv', dtype=['int32', 'int32', 'float32'], nrows=1000)\n",
"\n",
"# Create an empty graph\n",
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')\n",
"b = nx.betweenness_centrality(G, k=1000, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "e588aa65-6281-4c19-a51c-42f044636ac0",
"metadata": {},
"source": [
"### Type-Based Dispatching ###\n",
"For users wanting to ensure a particular behavior, without the potential for runtime conversions, NetworkX offers type-based dispatching. To utilize this method, users must import the desired backend and create a Graph instance for it."
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "5fea9300-8d75-443a-9ec0-ee65c8ccaf0f",
"metadata": {},
"outputs": [],
"source": [
"import networkx as nx\n",
"import nx_cugraph as nxcg\n",
"\n",
"# Loading data from previous cell\n",
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length') \n",
"\n",
"nxcg_G = nxcg.from_networkx(G) # conversion happens once here\n",
"b = nx.betweenness_centrality(nxcg_G, k=1000) # nxcg Graph type causes cugraph backend to be used, no conversion necessary"
]
},
{
"cell_type": "markdown",
"id": "cb5a17e1-d886-4d20-8d4b-ce900280279c",
"metadata": {},
"source": [
"## Computing Centrality ##\n",
"Now that we learned how to enable nx-cugraph, let's try to use it in a workflow! We will be using the backend argument for this example. First let's create a graph."
]
},
{
"cell_type": "markdown",
"id": "19bea37c-bccf-4815-81bd-aa1de553812d",
"metadata": {},
"source": [
"### Creating Graph ###"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "2b4420d7-7c89-4914-809f-4e323a12f47f",
"metadata": {},
"outputs": [],
"source": [
"# Create a graph from already loaded dataframe\n",
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')"
]
},
{
"cell_type": "markdown",
"id": "7dc1ad5b-8454-4277-9568-0cdacbebd9f1",
"metadata": {},
"source": [
"### Running Centrality Algorithms ###\n",
"Now, let's run the various centrality algorithms!"
]
},
{
"cell_type": "markdown",
"id": "1c52b7b3-6c23-45be-9ace-34a667f132aa",
"metadata": {},
"source": [
"### Betweenness Centrality ###\n",
"Quantifies the number of times a node acts as a bridge along the shortest path between two other nodes, highlighting its importance in information flow"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "281374af-c7cf-4592-a34d-796c1158dab6",
"metadata": {},
"outputs": [],
"source": [
"b = nx.betweenness_centrality(G, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "f98b2975-1f72-4bff-83c7-ace7aab65d98",
"metadata": {},
"source": [
"### Degree Centrality ###\n",
"Measures the number of direct connections a node has, indicating how well-connected it is within the network"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3e0c4460-6d25-4a2b-8b8f-8f8c6ef617b0",
"metadata": {},
"outputs": [],
"source": [
"d = nx.degree_centrality(G, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "0665a659-16b1-48b4-b3bb-9aa5659ef91c",
"metadata": {},
"source": [
"### Katz Centrality ###\n",
"Measures a node's centrality based on its global influence in the network, considering both direct and indirect connections"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "8ce418d2-9eda-40bc-9733-b82d8d7556b1",
"metadata": {},
"outputs": [],
"source": [
"k = nx.katz_centrality(G, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "0712cedb-87ba-4a08-a74d-24997d02a636",
"metadata": {},
"source": [
"### Pagerank Centrality ###\n",
"Determines a node's importance based on the quantity and quality of links to it, similar to Google's original PageRank algorithm"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "a17ee15b-8758-484b-82b9-a158187231c5",
"metadata": {},
"outputs": [],
"source": [
"p = nx.pagerank(G, max_iter=10, tol=1.0e-3, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "c5f57a5e-95e4-47f7-a9ec-04a99fa2c1dc",
"metadata": {},
"source": [
"### Eigenvector Centrality ###\n",
"Assigns scores to nodes based on the principle that connections to high-scoring nodes contribute more to the node's own score than connections to low-scoring nodes"
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "3eb1e358-ae8e-4399-bf45-90616b663e9d",
"metadata": {},
"outputs": [],
"source": [
"e = nx.eigenvector_centrality(G, max_iter=1000, tol=1.0e-3, backend=\"cugraph\")"
]
},
{
"cell_type": "markdown",
"id": "0bc9178c-e66a-4c75-bf91-0c5d668b5634",
"metadata": {},
"source": [
"### Visualize Results ###\n",
"Now let's visualize results! We will only display the top 5 rows for readibility. "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "69b6c23d-78a0-4dbb-be19-913ad180fe94",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_9f2bb\" style='display:inline'>\n",
" <caption>Degree</caption>\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_9f2bb_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
" <th id=\"T_9f2bb_level0_col1\" class=\"col_heading level0 col1\" >degree_centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_9f2bb_row0_col0\" class=\"data row0 col0\" >24</td>\n",
" <td id=\"T_9f2bb_row0_col1\" class=\"data row0 col1\" >0.002847</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_9f2bb_row1_col0\" class=\"data row1 col0\" >72</td>\n",
" <td id=\"T_9f2bb_row1_col1\" class=\"data row1 col1\" >0.002847</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_9f2bb_row2_col0\" class=\"data row2 col0\" >86</td>\n",
" <td id=\"T_9f2bb_row2_col1\" class=\"data row2 col1\" >0.002847</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_9f2bb_row3_col0\" class=\"data row3 col0\" >127</td>\n",
" <td id=\"T_9f2bb_row3_col1\" class=\"data row3 col1\" >0.002847</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_9f2bb_row4_col0\" class=\"data row4 col0\" >133</td>\n",
" <td id=\"T_9f2bb_row4_col1\" class=\"data row4 col1\" >0.002847</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_c13b0\" style='display:inline'>\n",
" <caption>Betweenness</caption>\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_c13b0_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
" <th id=\"T_c13b0_level0_col1\" class=\"col_heading level0 col1\" >betweenness_centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_c13b0_row0_col0\" class=\"data row0 col0\" >222</td>\n",
" <td id=\"T_c13b0_row0_col1\" class=\"data row0 col1\" >0.000007</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_c13b0_row1_col0\" class=\"data row1 col0\" >381</td>\n",
" <td id=\"T_c13b0_row1_col1\" class=\"data row1 col1\" >0.000007</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_c13b0_row2_col0\" class=\"data row2 col0\" >24</td>\n",
" <td id=\"T_c13b0_row2_col1\" class=\"data row2 col1\" >0.000006</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_c13b0_row3_col0\" class=\"data row3 col0\" >72</td>\n",
" <td id=\"T_c13b0_row3_col1\" class=\"data row3 col1\" >0.000006</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_c13b0_row4_col0\" class=\"data row4 col0\" >86</td>\n",
" <td id=\"T_c13b0_row4_col1\" class=\"data row4 col1\" >0.000006</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_afb59\" style='display:inline'>\n",
" <caption>Katz</caption>\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_afb59_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
" <th id=\"T_afb59_level0_col1\" class=\"col_heading level0 col1\" >katz_centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_afb59_row0_col0\" class=\"data row0 col0\" >24</td>\n",
" <td id=\"T_afb59_row0_col1\" class=\"data row0 col1\" >0.033058</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_afb59_row1_col0\" class=\"data row1 col0\" >72</td>\n",
" <td id=\"T_afb59_row1_col1\" class=\"data row1 col1\" >0.033058</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_afb59_row2_col0\" class=\"data row2 col0\" >86</td>\n",
" <td id=\"T_afb59_row2_col1\" class=\"data row2 col1\" >0.033058</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_afb59_row3_col0\" class=\"data row3 col0\" >127</td>\n",
" <td id=\"T_afb59_row3_col1\" class=\"data row3 col1\" >0.033058</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_afb59_row4_col0\" class=\"data row4 col0\" >133</td>\n",
" <td id=\"T_afb59_row4_col1\" class=\"data row4 col1\" >0.033058</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_bb8df\" style='display:inline'>\n",
" <caption>PageRank</caption>\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_bb8df_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
" <th id=\"T_bb8df_level0_col1\" class=\"col_heading level0 col1\" >pagerank</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_bb8df_row0_col0\" class=\"data row0 col0\" >24</td>\n",
" <td id=\"T_bb8df_row0_col1\" class=\"data row0 col1\" >0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_bb8df_row1_col0\" class=\"data row1 col0\" >72</td>\n",
" <td id=\"T_bb8df_row1_col1\" class=\"data row1 col1\" >0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_bb8df_row2_col0\" class=\"data row2 col0\" >86</td>\n",
" <td id=\"T_bb8df_row2_col1\" class=\"data row2 col1\" >0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_bb8df_row3_col0\" class=\"data row3 col0\" >127</td>\n",
" <td id=\"T_bb8df_row3_col1\" class=\"data row3 col1\" >0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_bb8df_row4_col0\" class=\"data row4 col0\" >133</td>\n",
" <td id=\"T_bb8df_row4_col1\" class=\"data row4 col1\" >0.002525</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"<style type=\"text/css\">\n",
"</style>\n",
"<table id=\"T_f5314\" style='display:inline'>\n",
" <caption>EigenVector</caption>\n",
" <thead>\n",
" <tr>\n",
" <th id=\"T_f5314_level0_col0\" class=\"col_heading level0 col0\" >vertex</th>\n",
" <th id=\"T_f5314_level0_col1\" class=\"col_heading level0 col1\" >eigenvector_centrality</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td id=\"T_f5314_row0_col0\" class=\"data row0 col0\" >24</td>\n",
" <td id=\"T_f5314_row0_col1\" class=\"data row0 col1\" >0.064086</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_f5314_row1_col0\" class=\"data row1 col0\" >72</td>\n",
" <td id=\"T_f5314_row1_col1\" class=\"data row1 col1\" >0.064086</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_f5314_row2_col0\" class=\"data row2 col0\" >86</td>\n",
" <td id=\"T_f5314_row2_col1\" class=\"data row2 col1\" >0.064086</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_f5314_row3_col0\" class=\"data row3 col0\" >127</td>\n",
" <td id=\"T_f5314_row3_col1\" class=\"data row3 col1\" >0.064086</td>\n",
" </tr>\n",
" <tr>\n",
" <td id=\"T_f5314_row4_col0\" class=\"data row4 col0\" >133</td>\n",
" <td id=\"T_f5314_row4_col1\" class=\"data row4 col1\" >0.064086</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"from IPython.display import display_html\n",
"dc_top = pd.DataFrame(sorted(d.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"degree_centrality\"])\n",
"bc_top = pd.DataFrame(sorted(b.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"betweenness_centrality\"])\n",
"katz_top = pd.DataFrame(sorted(k.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"katz_centrality\"])\n",
"pr_top = pd.DataFrame(sorted(p.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"pagerank\"])\n",
"ev_top = pd.DataFrame(sorted(e.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"eigenvector_centrality\"])\n",
"\n",
"df1_styler = dc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Degree').hide(axis='index')\n",
"df2_styler = bc_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Betweenness').hide(axis='index')\n",
"df3_styler = katz_top.style.set_table_attributes(\"style='display:inline'\").set_caption('Katz').hide(axis='index')\n",
"df4_styler = pr_top.style.set_table_attributes(\"style='display:inline'\").set_caption('PageRank').hide(axis='index')\n",
"df5_styler = ev_top.style.set_table_attributes(\"style='display:inline'\").set_caption('EigenVector').hide(axis='index')\n",
"\n",
"display_html(df1_styler._repr_html_()+df2_styler._repr_html_()+df3_styler._repr_html_()+df4_styler._repr_html_()+df5_styler._repr_html_(), raw=True)"
]
},
{
"cell_type": "markdown",
"id": "1a653ca9-9448-4ba5-85b2-f6c885c273a9",
"metadata": {},
"source": [
"### Exercise #1 - Type Dispatch ###\n",
"Use the type dispatching method to obtain pagerank centrality results with the cugraph backend."
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "6eb90078-1479-4847-97b7-eb119e9d5478",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Graph with 1406 nodes and 999 edges\n",
"CudaGraph with 1406 nodes and 999 edges\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>vertex</th>\n",
" <th>pagerank</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>24</td>\n",
" <td>0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>72</td>\n",
" <td>0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>86</td>\n",
" <td>0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>127</td>\n",
" <td>0.002525</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>133</td>\n",
" <td>0.002525</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" vertex pagerank\n",
"0 24 0.002525\n",
"1 72 0.002525\n",
"2 86 0.002525\n",
"3 127 0.002525\n",
"4 133 0.002525"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"import networkx as nx\n",
"import nx_cugraph as nxcg\n",
"\n",
"G = nx.from_pandas_edgelist(road_graph, source='src', target='dst', edge_attr='length')\n",
"nxcg_G = nxcg.from_networkx(G)\n",
"p = nx.pagerank(nxcg_G, max_iter=10, tol=1.0e-3)\n",
"\n",
"print(G)\n",
"print(nxcg_G)\n",
"\n",
"pd.DataFrame(sorted(p.items(), key=lambda x:x[1], reverse=True)[:5], columns=[\"vertex\", \"pagerank\"])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d70c78b7-551d-4d9e-b428-32b26adcd3c4",
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"id": "2279fdf1-82c0-4c6e-ac8e-b952f4777562",
"metadata": {},
"source": [
"**Well Done!** "
]
},
{
"cell_type": "markdown",
"id": "3fbc12b2-585c-48a9-a176-b2572040d378",
"metadata": {
"tags": []
},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 5
}

View File

@ -0,0 +1,426 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fundamentals of Accelerated Data Science # "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 02 - K-Means ##\n",
"\n",
"**Table of Contents**\n",
"<br>\n",
"This notebook uses GPU-accelerated K-means to find the best locations for a fixed number of humanitarian supply airdrop depots. This notebook covers the below sections: \n",
"1. [Environment](#Environment)\n",
"2. [Load Data](#Load-Data)\n",
"3. [K-Means Clustering](#K-Means-Clustering)\n",
" * [Exercise #1 - Make Another `KMeans` Instance](#Exercise-#1---Make-Another-KMeans-Instance)\n",
"4. [Visualize the Clusters](#Visualize-the-Clusters)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Environment ##\n",
"For the first time we import `cuml`, the RAPIDS GPU-accelerated library containing many common machine learning algorithms. We will be visualizing the results of your work in this notebook, so we also import `cuxfilter`."
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"import cudf\n",
"import cuml\n",
"\n",
"import cuxfilter as cxf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data ##\n",
"For this notebook we load again the cleaned UK population data--in this case, we are not specifically looking at counties, so we omit that column and just keep the grid coordinate columns."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"northing float64\n",
"easting float64\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"(58479894, 2)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"gdf = cudf.read_csv('./data/clean_uk_pop.csv', usecols=['easting', 'northing'])\n",
"print(gdf.dtypes)\n",
"gdf.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>northing</th>\n",
" <th>easting</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>515491.5313</td>\n",
" <td>430772.1875</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>503572.4688</td>\n",
" <td>434685.8750</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>517903.6563</td>\n",
" <td>432565.5313</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>517059.9063</td>\n",
" <td>427660.6250</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>509228.6875</td>\n",
" <td>425527.7813</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" northing easting\n",
"0 515491.5313 430772.1875\n",
"1 503572.4688 434685.8750\n",
"2 517903.6563 432565.5313\n",
"3 517059.9063 427660.6250\n",
"4 509228.6875 425527.7813"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name='#s2-3'></a>\n",
"## K-Means Clustering ##\n",
"The unsupervised K-means clustering algorithm will look for a fixed number *k* of centroids in the data and clusters each point with its closest centroid. K-means can be effective when the number of clusters *k* is known or has a good estimate (such as from a model of the underlying mechanics of a problem).\n",
"\n",
"Assume that in addition to knowing the distribution of the population, which we do, we would like to estimate the best locations to build a fixed number of humanitarian supply depots from which we can perform airdrops and reach the population most efficiently. We can use K-means, setting *k* to the number of supply depots available and fitting on the locations of the population, to identify candidate locations.\n",
"\n",
"GPU-accelerated K-means is just as easy as its CPU-only scikit-learn counterpart. In this series of exercises, you will use it to optimize the locations for 5 supply depots.\n",
"\n",
"`cuml.KMeans()` will initialize a K-means instance. Use it now to initialize a K-means instance called `km`, passing the named argument `n_clusters` set equal to our desired number `5`. Use the `km.fit` method to fit `km` to the population's locations by passing it the population data. After fitting, add the cluster labels back to the `gdf` in a new column named `cluster`. Finally, you can use `km.cluster_centers_` to see where the algorithm created the 5 centroids.\n",
"\n",
"Below we train a K-means clustering algorithm to find 5 clusters. "
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>306647.898235</td>\n",
" <td>408370.452191</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>442109.465392</td>\n",
" <td>402673.747673</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>288997.149971</td>\n",
" <td>553805.430444</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>148770.463641</td>\n",
" <td>311786.805381</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>170553.110214</td>\n",
" <td>521605.459724</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 306647.898235 408370.452191\n",
"1 442109.465392 402673.747673\n",
"2 288997.149971 553805.430444\n",
"3 148770.463641 311786.805381\n",
"4 170553.110214 521605.459724"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# instantaite\n",
"km = cuml.KMeans(n_clusters=5)\n",
"\n",
"# fit\n",
"km.fit(gdf)\n",
"\n",
"# assign cluster as new column\n",
"gdf['cluster'] = km.labels_\n",
"km.cluster_centers_"
]
},
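{
"cell_type": "markdown",
"metadata": {},
"source": [
"As an optional sanity check, we can count how many people were assigned to each cluster; `value_counts` works on the new `cluster` column just as it does in pandas."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: number of people assigned to each of the 5 clusters\n",
"gdf['cluster'].value_counts()"
]
},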
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a name='#s2-e1'></a>\n",
"## Exercise #1 - Make Another `KMeans` Instance ##\n",
"\n",
"**Instructions**: <br>\n",
"* Modify the `<FIXME>` only and execute the below cell to instantiate a K-means instance with 6 clusters.\n",
"* Modify the `<FIXME>` only and execute the cell below to fit the data. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"km = cuml.KMeans(n_clusters=6)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"km.fit(gdf)\n",
"gdf['cluster'] = km.labels_\n",
"km.cluster_centers_"
]
},
{
"cell_type": "raw",
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"source": [
"\n",
"km = cuml.KMeans(n_clusters=6)\n",
"\n",
"km.fit(gdf)\n",
"gdf['cluster'] = km.labels_\n",
"km.cluster_centers_"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Click ... for solution. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a id='#s2-4'></a>\n",
"## Visualize the Clusters ##\n",
"To help us understand where clusters are located, we make a visualization that separates them, using the same three steps as before.\n",
"\n",
"Below we plot the clusters with cuxfilter. "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# DO NOT CHANGE THIS CELL\n",
"# associate a data source with cuXfilter\n",
"cxf_data = cxf.DataFrame.from_dataframe(gdf)\n",
"\n",
"# define charts\n",
"scatter_chart = cxf.charts.datashader.scatter(x='easting', y='northing')\n",
"\n",
"# define widget using the `cluster` column for multiselect\n",
"# use the same technique to scale the scatterplot, then add a widget to let us select which cluster to look at\n",
"cluster_widget = cxf.charts.panel_widgets.multi_select('cluster')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# create dashboard\n",
"dash = cxf_data.dashboard(charts=[scatter_chart],sidebar=[cluster_widget], theme=cxf.themes.dark, data_size_widget=True)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"dash.app()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Well Done!** Let's move to the [next notebook](3-03_dbscan.ipynb). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,373 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fundamentals of Accelerated Data Science # "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 03 - DBSCAN ##\n",
"\n",
"**Table of Contents**\n",
"<br>\n",
"This notebook uses GPU-accelerated DBSCAN to identify clusters of infected people. This notebook covers the below sections: \n",
"1. [Environment](#Environment)\n",
"2. [Load Data](#Load-Data)\n",
"3. [DBSCAN Clustering](#DBSCAN-Clustering)\n",
" * [Exercise #1 - Make Another DBSCAN Instance](#Exercise-#1---Make-Another-DBSCAN-Instance)\n",
"4. [Visualize the Clusters](#Visualize-the-Clusters)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Environment ##"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import cuml\n",
"\n",
"import cuxfilter as cxf"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data ##\n",
"For this notebook, we again load a subset of our population data with only the columns we need. An `infected` column has been added to the data to indicate whether or not a person is known to be infected with our simulated virus."
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"northing float32\n",
"easting float32\n",
"infected float32\n",
"dtype: object\n"
]
},
{
"data": {
"text/plain": [
"(1000000, 3)"
]
},
"execution_count": 2,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf = cudf.read_csv('./data/pop_sample.csv', dtype=['float32', 'float32', 'float32'])\n",
"print(gdf.dtypes)\n",
"gdf.shape"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>northing</th>\n",
" <th>easting</th>\n",
" <th>infected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>178547.296875</td>\n",
" <td>368012.1250</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>174068.281250</td>\n",
" <td>543802.1250</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>358293.687500</td>\n",
" <td>435639.8750</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>87240.304688</td>\n",
" <td>389607.3750</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>158261.015625</td>\n",
" <td>340764.9375</td>\n",
" <td>0.0</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" northing easting infected\n",
"0 178547.296875 368012.1250 0.0\n",
"1 174068.281250 543802.1250 0.0\n",
"2 358293.687500 435639.8750 0.0\n",
"3 87240.304688 389607.3750 0.0\n",
"4 158261.015625 340764.9375 0.0"
]
},
"execution_count": 3,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"infected\n",
"0.0 984331\n",
"1.0 15669\n",
"Name: count, dtype: int64"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf['infected'].value_counts()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## DBSCAN Clustering ##\n",
"DBSCAN is another unsupervised clustering algorithm that is particularly effective when the number of clusters is not known up front and the clusters may have concave or other unusual shapes--a situation that often applies in geospatial analytics.\n",
"\n",
"In this series of exercises you will use DBSCAN to identify clusters of infected people by location, which may help us identify groups becoming infected from common patient zeroes and assist in response planning.\n",
"\n",
"Create a DBSCAN instance by using `cuml.DBSCAN`. Pass in the named argument `eps` (the maximum distance a point can be from the nearest point in a cluster to be considered possibly in that cluster) to be `5000`. Since the `northing` and `easting` values we created are measured in meters, this will allow us to identify clusters of infected people where individuals may be separated from the rest of the cluster by up to 5 kilometers.\n",
"\n",
"Below we train a DBSCAN algorithm. We start by creating a new dataframe from rows of the original dataframe where `infected` is `1` (true), and call it `infected_df`--be sure to reset the dataframe's index afterward. Use `dbscan.fit_predict` to perform clustering on the `northing` and `easting` columns of `infected_df`, and turn the resulting series into a new column in `infected_gdf` called \"cluster\". Finally, compute the number of clusters identified by DBSCAN."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"96"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dbscan = cuml.DBSCAN(eps=5000)\n",
"# dbscan = cuml.DBSCAN(eps=10000)\n",
"\n",
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
"infected_df['cluster'].nunique()"
]
},
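{
"cell_type": "markdown",
"metadata": {},
"source": [
"DBSCAN labels any points it treats as noise with `-1`, so the count above includes that noise \"cluster\" if any points were left unassigned. As an optional check, we can look at the sizes of the largest clusters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional check: sizes of the largest clusters (-1, if present, is DBSCAN's noise label)\n",
"infected_df['cluster'].value_counts().head()"
]
},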
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise #1 - Make Another DBSCAN Instance ###\n",
"\n",
"**Instructions**: <br>\n",
"* Modify the `<FIXME>` only and execute the below cell to instantiate a DBSCAN instance with `10000` for `eps`.\n",
"* Modify the `<FIXME>` only and execute the cell below to fit the data and identify infected clusters. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [],
"source": [
"dbscan = cuml.DBSCAN(eps=10000)"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"10"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
"infected_df['cluster'].nunique()"
]
},
{
"cell_type": "raw",
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"source": [
"\n",
"dbscan = cuml.DBSCAN(eps=10000)\n",
"\n",
"infected_df = gdf[gdf['infected'] == 1].reset_index()\n",
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
"infected_df['cluster'].nunique()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Click ... for solution. "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Visualize the Clusters"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Because we have the same column names as in the K-means example--`easting`, `northing`, and `cluster`--we can use the same code to visualize the clusters."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"infected_df.to_pandas().plot(kind='scatter', x='easting', y='northing', c='cluster')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Well Done!** Let's move to the [next notebook](3-04_logistic_regression.ipynb). "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,673 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Fundamentals of Accelerated Data Science # "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## 08 - Multi-GPU K-Means with Dask ##\n",
"\n",
"**Table of Contents**\n",
"<br>\n",
"This notebook uses GPU-accelerated K-means to identify population clusters in a multi-node, multi-GPU scalable way with Dask. This notebook covers the below sections: \n",
"1. [Environment](#Environment)\n",
"2. [Load and Persist Data](#Load-and-Persist-Data)\n",
"3. [Training the Model](#Training-the-Model)\n",
" * [Exercise #1 - Count Members of the Southernmost Cluster](#Exercise-#1---Count-Members-of-the-Southernmost-Cluster)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Environment ##\n",
"First we import the needed modules to create a Dask cuDF cluster. As we did before, we need to import CUDA context creators after setting up the cluster so they don't lock to a single device. "
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import subprocess\n",
"import logging\n",
"\n",
"from dask.distributed import Client, wait, progress\n",
"from dask_cuda import LocalCUDACluster"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"import cudf\n",
"import dask_cudf\n",
"\n",
"import cuml\n",
"from cuml.dask.cluster import KMeans"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# create cluster\n",
"cmd = \"hostname --all-ip-addresses\"\n",
"process = subprocess.Popen(cmd.split(), stdout=subprocess.PIPE)\n",
"output, error = process.communicate()\n",
"IPADDR = str(output.decode()).split()[0]\n",
"\n",
"cluster = LocalCUDACluster(ip=IPADDR, silence_logs=logging.ERROR)\n",
"client = Client(cluster)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load and Persist Data ##\n",
"We will begin by loading the data, The data set has the two grid coordinate columns, `easting` and `northing`, derived from the main population data set we have prepared."
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"ddf = dask_cudf.read_csv('./data/uk_pop5x_coords.csv', dtype=['float32', 'float32'])"
]
},
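{
"cell_type": "markdown",
"metadata": {},
"source": [
"Dask evaluates lazily, so `read_csv` above only builds the task graph. To actually load the partitions and keep them resident in GPU memory across the workers before training, we can persist the dataframe. The cell below is a small optional sketch using the `persist` method and the `wait` helper imported earlier."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Optional sketch: materialize the partitions on the workers and wait for the load to finish.\n",
"ddf = ddf.persist()\n",
"wait(ddf)"
]
},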
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Training the Model ##\n",
"Training the K-means model is very similar to both the scikit-learn version and the cuML single-GPU version--by setting up the client and importing from the `cuml.dask.cluster` module, the algorithm will automatically use the local Dask cluster we have set up.\n",
"\n",
"Note that calling `.fit` triggers Dask computation.\n",
"\n",
"Once we have the fit model, we extract the cluster centers and rename the columns from their generic `0` and `1` to reflect the data on which they were trained."
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 5.24 s, sys: 2.48 s, total: 7.72 s\n",
"Wall time: 1min 54s\n"
]
},
{
"data": {
"text/html": [
"<style>#sk-container-id-1 {\n",
" /* Definition of color scheme common for light and dark mode */\n",
" --sklearn-color-text: black;\n",
" --sklearn-color-line: gray;\n",
" /* Definition of color scheme for unfitted estimators */\n",
" --sklearn-color-unfitted-level-0: #fff5e6;\n",
" --sklearn-color-unfitted-level-1: #f6e4d2;\n",
" --sklearn-color-unfitted-level-2: #ffe0b3;\n",
" --sklearn-color-unfitted-level-3: chocolate;\n",
" /* Definition of color scheme for fitted estimators */\n",
" --sklearn-color-fitted-level-0: #f0f8ff;\n",
" --sklearn-color-fitted-level-1: #d4ebff;\n",
" --sklearn-color-fitted-level-2: #b3dbfd;\n",
" --sklearn-color-fitted-level-3: cornflowerblue;\n",
"\n",
" /* Specific color for light theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, white)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, black)));\n",
" --sklearn-color-icon: #696969;\n",
"\n",
" @media (prefers-color-scheme: dark) {\n",
" /* Redefinition of color scheme for dark theme */\n",
" --sklearn-color-text-on-default-background: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-background: var(--sg-background-color, var(--theme-background, var(--jp-layout-color0, #111)));\n",
" --sklearn-color-border-box: var(--sg-text-color, var(--theme-code-foreground, var(--jp-content-font-color1, white)));\n",
" --sklearn-color-icon: #878787;\n",
" }\n",
"}\n",
"\n",
"#sk-container-id-1 {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"#sk-container-id-1 pre {\n",
" padding: 0;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-hidden--visually {\n",
" border: 0;\n",
" clip: rect(1px 1px 1px 1px);\n",
" clip: rect(1px, 1px, 1px, 1px);\n",
" height: 1px;\n",
" margin: -1px;\n",
" overflow: hidden;\n",
" padding: 0;\n",
" position: absolute;\n",
" width: 1px;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-dashed-wrapped {\n",
" border: 1px dashed var(--sklearn-color-line);\n",
" margin: 0 0.4em 0.5em 0.4em;\n",
" box-sizing: border-box;\n",
" padding-bottom: 0.4em;\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-container {\n",
" /* jupyter's `normalize.less` sets `[hidden] { display: none; }`\n",
" but bootstrap.min.css set `[hidden] { display: none !important; }`\n",
" so we also need the `!important` here to be able to override the\n",
" default hidden behavior on the sphinx rendered scikit-learn.org.\n",
" See: https://github.com/scikit-learn/scikit-learn/issues/21755 */\n",
" display: inline-block !important;\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-text-repr-fallback {\n",
" display: none;\n",
"}\n",
"\n",
"div.sk-parallel-item,\n",
"div.sk-serial,\n",
"div.sk-item {\n",
" /* draw centered vertical line to link estimators */\n",
" background-image: linear-gradient(var(--sklearn-color-text-on-default-background), var(--sklearn-color-text-on-default-background));\n",
" background-size: 2px 100%;\n",
" background-repeat: no-repeat;\n",
" background-position: center center;\n",
"}\n",
"\n",
"/* Parallel-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item::after {\n",
" content: \"\";\n",
" width: 100%;\n",
" border-bottom: 2px solid var(--sklearn-color-text-on-default-background);\n",
" flex-grow: 1;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel {\n",
" display: flex;\n",
" align-items: stretch;\n",
" justify-content: center;\n",
" background-color: var(--sklearn-color-background);\n",
" position: relative;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item {\n",
" display: flex;\n",
" flex-direction: column;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:first-child::after {\n",
" align-self: flex-end;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:last-child::after {\n",
" align-self: flex-start;\n",
" width: 50%;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-parallel-item:only-child::after {\n",
" width: 0;\n",
"}\n",
"\n",
"/* Serial-specific style estimator block */\n",
"\n",
"#sk-container-id-1 div.sk-serial {\n",
" display: flex;\n",
" flex-direction: column;\n",
" align-items: center;\n",
" background-color: var(--sklearn-color-background);\n",
" padding-right: 1em;\n",
" padding-left: 1em;\n",
"}\n",
"\n",
"\n",
"/* Toggleable style: style used for estimator/Pipeline/ColumnTransformer box that is\n",
"clickable and can be expanded/collapsed.\n",
"- Pipeline and ColumnTransformer use this feature and define the default style\n",
"- Estimators will overwrite some part of the style using the `sk-estimator` class\n",
"*/\n",
"\n",
"/* Pipeline and ColumnTransformer style (default) */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable {\n",
" /* Default theme specific background. It is overwritten whether we have a\n",
" specific estimator or a Pipeline/ColumnTransformer */\n",
" background-color: var(--sklearn-color-background);\n",
"}\n",
"\n",
"/* Toggleable label */\n",
"#sk-container-id-1 label.sk-toggleable__label {\n",
" cursor: pointer;\n",
" display: block;\n",
" width: 100%;\n",
" margin-bottom: 0;\n",
" padding: 0.5em;\n",
" box-sizing: border-box;\n",
" text-align: center;\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:before {\n",
" /* Arrow on the left of the label */\n",
" content: \"▸\";\n",
" float: left;\n",
" margin-right: 0.25em;\n",
" color: var(--sklearn-color-icon);\n",
"}\n",
"\n",
"#sk-container-id-1 label.sk-toggleable__label-arrow:hover:before {\n",
" color: var(--sklearn-color-text);\n",
"}\n",
"\n",
"/* Toggleable content - dropdown */\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content {\n",
" max-height: 0;\n",
" max-width: 0;\n",
" overflow: hidden;\n",
" text-align: left;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content pre {\n",
" margin: 0.2em;\n",
" border-radius: 0.25em;\n",
" color: var(--sklearn-color-text);\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-toggleable__content.fitted pre {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~div.sk-toggleable__content {\n",
" /* Expand drop-down */\n",
" max-height: 200px;\n",
" max-width: 100%;\n",
" overflow: auto;\n",
"}\n",
"\n",
"#sk-container-id-1 input.sk-toggleable__control:checked~label.sk-toggleable__label-arrow:before {\n",
" content: \"▾\";\n",
"}\n",
"\n",
"/* Pipeline/ColumnTransformer-specific style */\n",
"\n",
"#sk-container-id-1 div.sk-label input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator-specific style */\n",
"\n",
"/* Colorize estimator box */\n",
"#sk-container-id-1 div.sk-estimator input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted input.sk-toggleable__control:checked~label.sk-toggleable__label {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label label.sk-toggleable__label,\n",
"#sk-container-id-1 div.sk-label label {\n",
" /* The background is the default theme color */\n",
" color: var(--sklearn-color-text-on-default-background);\n",
"}\n",
"\n",
"/* On hover, darken the color of the background */\n",
"#sk-container-id-1 div.sk-label:hover label.sk-toggleable__label {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"/* Label box, darken color on hover, fitted */\n",
"#sk-container-id-1 div.sk-label.fitted:hover label.sk-toggleable__label.fitted {\n",
" color: var(--sklearn-color-text);\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Estimator label */\n",
"\n",
"#sk-container-id-1 div.sk-label label {\n",
" font-family: monospace;\n",
" font-weight: bold;\n",
" display: inline-block;\n",
" line-height: 1.2em;\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-label-container {\n",
" text-align: center;\n",
"}\n",
"\n",
"/* Estimator-specific */\n",
"#sk-container-id-1 div.sk-estimator {\n",
" font-family: monospace;\n",
" border: 1px dotted var(--sklearn-color-border-box);\n",
" border-radius: 0.25em;\n",
" box-sizing: border-box;\n",
" margin-bottom: 0.5em;\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-0);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-0);\n",
"}\n",
"\n",
"/* on hover */\n",
"#sk-container-id-1 div.sk-estimator:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-2);\n",
"}\n",
"\n",
"#sk-container-id-1 div.sk-estimator.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-2);\n",
"}\n",
"\n",
"/* Specification for estimator info (e.g. \"i\" and \"?\") */\n",
"\n",
"/* Common style for \"i\" and \"?\" */\n",
"\n",
".sk-estimator-doc-link,\n",
"a:link.sk-estimator-doc-link,\n",
"a:visited.sk-estimator-doc-link {\n",
" float: right;\n",
" font-size: smaller;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1em;\n",
" height: 1em;\n",
" width: 1em;\n",
" text-decoration: none !important;\n",
" margin-left: 1ex;\n",
" /* unfitted */\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted,\n",
"a:link.sk-estimator-doc-link.fitted,\n",
"a:visited.sk-estimator-doc-link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"div.sk-estimator:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link:hover,\n",
".sk-estimator-doc-link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"div.sk-estimator.fitted:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover,\n",
"div.sk-label-container:hover .sk-estimator-doc-link.fitted:hover,\n",
".sk-estimator-doc-link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"/* Span, style for the box shown on hovering the info icon */\n",
".sk-estimator-doc-link span {\n",
" display: none;\n",
" z-index: 9999;\n",
" position: relative;\n",
" font-weight: normal;\n",
" right: .2ex;\n",
" padding: .5ex;\n",
" margin: .5ex;\n",
" width: min-content;\n",
" min-width: 20ex;\n",
" max-width: 50ex;\n",
" color: var(--sklearn-color-text);\n",
" box-shadow: 2pt 2pt 4pt #999;\n",
" /* unfitted */\n",
" background: var(--sklearn-color-unfitted-level-0);\n",
" border: .5pt solid var(--sklearn-color-unfitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link.fitted span {\n",
" /* fitted */\n",
" background: var(--sklearn-color-fitted-level-0);\n",
" border: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"\n",
".sk-estimator-doc-link:hover span {\n",
" display: block;\n",
"}\n",
"\n",
"/* \"?\"-specific style due to the `<a>` HTML tag */\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link {\n",
" float: right;\n",
" font-size: 1rem;\n",
" line-height: 1em;\n",
" font-family: monospace;\n",
" background-color: var(--sklearn-color-background);\n",
" border-radius: 1rem;\n",
" height: 1rem;\n",
" width: 1rem;\n",
" text-decoration: none;\n",
" /* unfitted */\n",
" color: var(--sklearn-color-unfitted-level-1);\n",
" border: var(--sklearn-color-unfitted-level-1) 1pt solid;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted {\n",
" /* fitted */\n",
" border: var(--sklearn-color-fitted-level-1) 1pt solid;\n",
" color: var(--sklearn-color-fitted-level-1);\n",
"}\n",
"\n",
"/* On hover */\n",
"#sk-container-id-1 a.estimator_doc_link:hover {\n",
" /* unfitted */\n",
" background-color: var(--sklearn-color-unfitted-level-3);\n",
" color: var(--sklearn-color-background);\n",
" text-decoration: none;\n",
"}\n",
"\n",
"#sk-container-id-1 a.estimator_doc_link.fitted:hover {\n",
" /* fitted */\n",
" background-color: var(--sklearn-color-fitted-level-3);\n",
"}\n",
"</style><div id=\"sk-container-id-1\" class=\"sk-top-container\"><div class=\"sk-text-repr-fallback\"><pre>KMeansMG()</pre><b>In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook. <br />On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.</b></div><div class=\"sk-container\" hidden><div class=\"sk-item\"><div class=\"sk-estimator fitted sk-toggleable\"><input class=\"sk-toggleable__control sk-hidden--visually\" id=\"sk-estimator-id-1\" type=\"checkbox\" checked><label for=\"sk-estimator-id-1\" class=\"sk-toggleable__label fitted sk-toggleable__label-arrow fitted\">&nbsp;KMeansMG<span class=\"sk-estimator-doc-link fitted\">i<span>Fitted</span></span></label><div class=\"sk-toggleable__content fitted\"><pre>KMeansMG()</pre></div> </div></div></div></div>"
],
"text/plain": [
"KMeansMG()"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"%%time\n",
"dkm = KMeans(n_clusters=20)\n",
"dkm.fit(ddf)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"northing float32\n",
"easting float32\n",
"dtype: object"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cluster_centers = dkm.cluster_centers_\n",
"cluster_centers.columns = ddf.columns\n",
"cluster_centers.dtypes"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Exercise #1 - Count Members of the Southernmost Cluster ###\n",
"Using the `cluster_centers`, identify which cluster is the southernmost (has the lowest `northing` value) with the `nsmallest` method, then use `dkm.predict` to get labels for the data, and finally filter the labels to determine how many individuals the model estimated were in that cluster. \n",
"\n",
"**Instructions**: <br>\n",
"* Modify the `<FIXME>` only and execute the below cell to estimate the number of individuals in the southernmost cluster. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"31435157"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"south_idx = cluster_centers.nsmallest(1, 'northing').index[0]\n",
"labels_predicted = dkm.predict(ddf)\n",
"labels_predicted[labels_predicted==south_idx].compute().shape[0]"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"{'status': 'ok', 'restart': True}"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"154087362\n",
"144014032\n",
"131789736\n",
"154907810\n"
]
}
],
"source": [
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Well Done!**"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<img src=\"./images/DLI_Header.png\" width=400/>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

View File

@ -0,0 +1,818 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"><img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Week 1: Find Clusters of Infected People"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<span style=\"color:red\">\n",
"**URGENT WARNING**\n",
"\n",
"We have been receiving reports from health facilities that a new, fast-spreading virus has been discovered in the population. To prepare our response, we need to understand the geospatial distribution of those who have been infected. Find out whether there are identifiable clusters of infected individuals and where they are. \n",
"</span>\n",
"\n",
"Your goal for this notebook will be to estimate the location of dense geographic clusters of infected people using incoming data from week 1 of the simulated epidemic."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Imports"
]
},
{
"cell_type": "code",
"execution_count": 65,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"The cudf.pandas extension is already loaded. To reload it, use:\n",
" %reload_ext cudf.pandas\n"
]
}
],
"source": [
"%load_ext cudf.pandas\n",
"import pandas as pd\n",
"import cuml\n",
"\n",
"import cupy as cp"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Load Data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Begin by loading the data you've received about week 1 of the outbreak into a cuDF-accelerated pandas DataFrame. The data is located at `'./data/week1.csv'`. For this notebook you will only need the `'lat'`, `'long'`, and `'infected'` columns. Either drop the columns after loading, or use the `pd.read_csv` named argument `usecols` to provide a list of only the columns you need."
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {
"scrolled": true
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>infected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>54.522511</td>\n",
" <td>-1.571896</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>54.554031</td>\n",
" <td>-1.524968</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>54.552483</td>\n",
" <td>-1.435203</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>54.537186</td>\n",
" <td>-1.566215</td>\n",
" <td>False</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>54.528210</td>\n",
" <td>-1.588462</td>\n",
" <td>False</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" lat long infected\n",
"0 54.522511 -1.571896 False\n",
"1 54.554031 -1.524968 False\n",
"2 54.552483 -1.435203 False\n",
"3 54.537186 -1.566215 False\n",
"4 54.528210 -1.588462 False"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df = pd.read_csv('./data/week1.csv', dtype = {\n",
" 'lat': 'float32',\n",
" 'long': 'float32',\n",
" 'infected': 'category',\n",
"}, usecols = ['lat', 'long', 'infected'])\n",
"df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make Data Frame of the Infected"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Make a new DataFrame `infected_df` that contains only the infected members of the population.\n",
"\n",
"**Tip**: Reset the index of `infected_df` with `.reset_index(drop=True)`. "
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[28928759 28930512 28930904 ... 57410428 57411005 57411919]\n",
"[ 0 1 2 ... 18145 18146 18147]\n"
]
},
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>infected</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>54.472767</td>\n",
" <td>-1.654932</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>54.529720</td>\n",
" <td>-1.667143</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>54.512981</td>\n",
" <td>-1.589866</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>54.522320</td>\n",
" <td>-1.380694</td>\n",
" <td>True</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>54.541656</td>\n",
" <td>-1.613490</td>\n",
" <td>True</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" lat long infected\n",
"0 54.472767 -1.654932 True\n",
"1 54.529720 -1.667143 True\n",
"2 54.512981 -1.589866 True\n",
"3 54.522320 -1.380694 True\n",
"4 54.541656 -1.613490 True"
]
},
"execution_count": 13,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infected_df = df[df['infected'] == True]\n",
"print(infected_df.index.values)\n",
"\n",
"infected_df = infected_df.reset_index(drop=True)\n",
"\n",
"print(infected_df.index.values)\n",
"infected_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Make Grid Coordinates for Infected Locations"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Provided for you in the next cell (which you can expand by clicking on the \"...\" and contract again after executing by clicking on the blue left border of the cell) is the lat/long to OSGB36 grid coordinates converter you used earlier in the workshop. Use this converter to create grid coordinate values stored in `northing` and `easting` columns of the `infected_df` you created in the last step."
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {
"jupyter": {
"source_hidden": true
}
},
"outputs": [],
"source": [
"# https://www.ordnancesurvey.co.uk/docs/support/guide-coordinate-systems-great-britain.pdf\n",
"\n",
"def latlong2osgbgrid_cupy(lat, long, input_degrees=True):\n",
" '''\n",
" Converts latitude and longitude (ellipsoidal) coordinates into northing and easting (grid) coordinates, using a Transverse Mercator projection.\n",
" \n",
" Inputs:\n",
" lat: latitude coordinate (N)\n",
" long: longitude coordinate (E)\n",
" input_degrees: if True (default), interprets the coordinates as degrees; otherwise, interprets coordinates as radians\n",
" \n",
" Output:\n",
" (northing, easting)\n",
" '''\n",
" \n",
" if input_degrees:\n",
" lat = lat * cp.pi/180\n",
" long = long * cp.pi/180\n",
"\n",
" a = 6377563.396\n",
" b = 6356256.909\n",
" e2 = (a**2 - b**2) / a**2\n",
"\n",
" N0 = -100000 # northing of true origin\n",
" E0 = 400000 # easting of true origin\n",
" F0 = .9996012717 # scale factor on central meridian\n",
" phi0 = 49 * cp.pi / 180 # latitude of true origin\n",
" lambda0 = -2 * cp.pi / 180 # longitude of true origin and central meridian\n",
" \n",
" sinlat = cp.sin(lat)\n",
" coslat = cp.cos(lat)\n",
" tanlat = cp.tan(lat)\n",
" \n",
" latdiff = lat-phi0\n",
" longdiff = long-lambda0\n",
"\n",
" n = (a-b) / (a+b)\n",
" nu = a * F0 * (1 - e2 * sinlat ** 2) ** -.5\n",
" rho = a * F0 * (1 - e2) * (1 - e2 * sinlat ** 2) ** -1.5\n",
" eta2 = nu / rho - 1\n",
" M = b * F0 * ((1 + n + 5/4 * (n**2 + n**3)) * latdiff - \n",
" (3*(n+n**2) + 21/8 * n**3) * cp.sin(latdiff) * cp.cos(lat+phi0) +\n",
" 15/8 * (n**2 + n**3) * cp.sin(2*(latdiff)) * cp.cos(2*(lat+phi0)) - \n",
" 35/24 * n**3 * cp.sin(3*(latdiff)) * cp.cos(3*(lat+phi0)))\n",
" I = M + N0\n",
" II = nu/2 * sinlat * coslat\n",
" III = nu/24 * sinlat * coslat ** 3 * (5 - tanlat ** 2 + 9 * eta2)\n",
" IIIA = nu/720 * sinlat * coslat ** 5 * (61-58 * tanlat**2 + tanlat**4)\n",
" IV = nu * coslat\n",
" V = nu / 6 * coslat**3 * (nu/rho - cp.tan(lat)**2)\n",
" VI = nu / 120 * coslat ** 5 * (5 - 18 * tanlat**2 + tanlat**4 + 14 * eta2 - 58 * tanlat**2 * eta2)\n",
"\n",
" northing = I + II * longdiff**2 + III * longdiff**4 + IIIA * longdiff**6\n",
" easting = E0 + IV * longdiff + V * longdiff**3 + VI * longdiff**5\n",
"\n",
" return(northing, easting)"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>lat</th>\n",
" <th>long</th>\n",
" <th>infected</th>\n",
" <th>northing</th>\n",
" <th>easting</th>\n",
" <th>cluster</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>54.472767</td>\n",
" <td>-1.654932</td>\n",
" <td>True</td>\n",
" <td>508670.609809</td>\n",
" <td>422359.747233</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>54.529720</td>\n",
" <td>-1.667143</td>\n",
" <td>True</td>\n",
" <td>515003.452959</td>\n",
" <td>421538.534748</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>54.512981</td>\n",
" <td>-1.589866</td>\n",
" <td>True</td>\n",
" <td>513167.311551</td>\n",
" <td>426549.871569</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>54.522320</td>\n",
" <td>-1.380694</td>\n",
" <td>True</td>\n",
" <td>514305.528712</td>\n",
" <td>440081.234190</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>54.541656</td>\n",
" <td>-1.613490</td>\n",
" <td>True</td>\n",
" <td>516349.193146</td>\n",
" <td>425002.998690</td>\n",
" <td>-1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" lat long infected northing easting cluster\n",
"0 54.472767 -1.654932 True 508670.609809 422359.747233 -1\n",
"1 54.529720 -1.667143 True 515003.452959 421538.534748 -1\n",
"2 54.512981 -1.589866 True 513167.311551 426549.871569 -1\n",
"3 54.522320 -1.380694 True 514305.528712 440081.234190 -1\n",
"4 54.541656 -1.613490 True 516349.193146 425002.998690 -1"
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"cupy_lat = cp.asarray(infected_df['lat'])\n",
"cupy_long = cp.asarray(infected_df['long'])\n",
"\n",
"infected_df['northing'], infected_df['easting'] = latlong2osgbgrid_cupy(cupy_lat, cupy_long)\n",
"infected_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find Clusters of Infected People"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use DBSCAN to find clusters of at least 25 infected people where no member is more than 2000m from at least one other cluster member. Create a new column in `infected_df` which contains the cluster to which each infected person belongs."
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"<cudf.core.groupby.groupby.DataFrameGroupBy object at 0x7f55ea949240>"
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"dbscan = cuml.DBSCAN(eps = 2000, min_samples = 25)\n",
"infected_df['cluster'] = dbscan.fit_predict(infected_df[['northing', 'easting']])\n",
"infected_df.groupby('cluster')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find the Centroid of Each Cluster"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use grouping to find the mean `northing` and `easting` values for each cluster identified above."
]
},
{
"cell_type": "code",
"execution_count": 71,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>northing</th>\n",
" <th>easting</th>\n",
" </tr>\n",
" <tr>\n",
" <th>cluster</th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>-1</th>\n",
" <td>378094.622647</td>\n",
" <td>401880.682473</td>\n",
" </tr>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>397661.319575</td>\n",
" <td>371410.021738</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>436475.527827</td>\n",
" <td>332980.449214</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>347062.477357</td>\n",
" <td>389386.823243</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>359668.552556</td>\n",
" <td>379638.020362</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>391630.403390</td>\n",
" <td>431158.137254</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>386471.397432</td>\n",
" <td>426559.085587</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>434970.462486</td>\n",
" <td>406985.278520</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>412772.652344</td>\n",
" <td>410069.663793</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>415808.971615</td>\n",
" <td>414713.750256</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>417322.530166</td>\n",
" <td>409583.737652</td>\n",
" </tr>\n",
" <tr>\n",
" <th>10</th>\n",
" <td>334208.471668</td>\n",
" <td>435937.777721</td>\n",
" </tr>\n",
" <tr>\n",
" <th>11</th>\n",
" <td>300568.023792</td>\n",
" <td>391901.514790</td>\n",
" </tr>\n",
" <tr>\n",
" <th>12</th>\n",
" <td>291539.540205</td>\n",
" <td>401640.663845</td>\n",
" </tr>\n",
" <tr>\n",
" <th>13</th>\n",
" <td>289855.069902</td>\n",
" <td>394518.295606</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" northing easting\n",
"cluster \n",
"-1 378094.622647 401880.682473\n",
" 0 397661.319575 371410.021738\n",
" 1 436475.527827 332980.449214\n",
" 2 347062.477357 389386.823243\n",
" 3 359668.552556 379638.020362\n",
" 4 391630.403390 431158.137254\n",
" 5 386471.397432 426559.085587\n",
" 6 434970.462486 406985.278520\n",
" 7 412772.652344 410069.663793\n",
" 8 415808.971615 414713.750256\n",
" 9 417322.530166 409583.737652\n",
" 10 334208.471668 435937.777721\n",
" 11 300568.023792 391901.514790\n",
" 12 291539.540205 401640.663845\n",
" 13 289855.069902 394518.295606"
]
},
"execution_count": 71,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"centroids_df = infected_df[['northing', 'easting', 'cluster']].groupby('cluster').mean()\n",
"centroids_df"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Find the number of people in each cluster by counting the number of appearances of each cluster's label in the column produced by DBSCAN."
]
},
{
"cell_type": "code",
"execution_count": 72,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"cluster\n",
"-1 8451\n",
" 0 8638\n",
" 1 68\n",
" 2 403\n",
" 3 25\n",
" 4 66\n",
" 5 43\n",
" 6 27\n",
" 7 39\n",
" 8 92\n",
" 9 21\n",
" 10 64\n",
" 11 68\n",
" 12 72\n",
" 13 71\n",
"dtype: int64"
]
},
"execution_count": 72,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"infected_df.groupby(['cluster']).size()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Find the Centroid of the Cluster with the Most Members ##"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Use the cluster label for with the most people to filter `centroid_df` and write the answer to `my_assessment/question_1.json`. "
]
},
{
"cell_type": "code",
"execution_count": 78,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.10/site-packages/cudf/io/json.py:194: UserWarning: Using CPU via Pandas to write JSON dataset\n",
" warnings.warn(\"Using CPU via Pandas to write JSON dataset\")\n"
]
}
],
"source": [
"centroids_df.loc[0].to_json('my_assessment/question_1.json')"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Check Submission ##"
]
},
{
"cell_type": "code",
"execution_count": 79,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"{\"northing\":397661.3195752321,\"easting\":371410.0217381102}"
]
}
],
"source": [
"!cat my_assessment/question_1.json"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"**Tip**: Your submission file should contain one line of text, similar to: \n",
"\n",
"```\n",
"{'northing':XXX.XX,'easting':XXX.XX}\n",
"```"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<div align=\"center\"><h2>Please Restart the Kernel</h2></div>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import IPython\n",
"app = IPython.Application.instance()\n",
"app.kernel.do_shutdown(True)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"><img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/></a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.15"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,172 @@
county,lat_county_center,long_county_center
BARKING AND DAGENHAM,51.621048311776526,0.12958319845588165
BARNET,51.81255163972051,-0.21821206632197684
BARNSLEY,53.57190690010971,-1.5487193565226611
BATH AND NORTH EAST SOMERSET,51.35496548780361,-2.486675162410336
BEDFORD,52.145475839485385,-0.4549734374180617
BEXLEY,51.33625605642689,0.14633321710015448
BIRMINGHAM,52.12178304394528,-1.881329432771379
BLACKBURN WITH DARWEN,53.63718763008419,-2.463700844959783
BLACKPOOL,53.882118373353435,-3.0229009637127167
BLAENAU GWENT,51.75159582861159,-3.1862426125686745
BOLTON,53.73813128127497,-2.4794091133678147
BRACKNELL FOREST,51.457925145468295,-0.7336441271286038
BRADFORD,53.972113267048044,-1.8738762931122748
BRENT,51.761695309784,-0.2756927203781798
BRIDGEND,51.522888539164526,-3.6137468421270604
BRIGHTON AND HOVE,50.94890407892698,-0.1507807253912774
"BRISTOL, CITY OF",51.53203785026057,-2.5774864859032594
BROMLEY,51.2251371203518,0.03905163114984023
BUCKINGHAMSHIRE,51.92925587759856,-0.8053996183750294
BURY,53.61553432785575,-2.3088650595977023
CAERPHILLY,51.62781255006381,-3.1973649865483735
CALDERDALE,53.769761331289686,-1.9616103771384508
CAMBRIDGESHIRE,52.1333820427886,-0.23503728806014595
CAMDEN,51.69346289078886,-0.1629412552292679
CARDIFF,51.56635588939404,-3.222317281083218
CARMARTHENSHIRE,51.92106862577838,-4.211293704149962
CENTRAL BEDFORDSHIRE,51.99983427713095,-0.4775810785914261
CEREDIGION,52.297905934896974,-3.9524382809074967
CHESHIRE EAST,53.209779668583735,-2.2923524120906538
CHESHIRE WEST AND CHESTER,53.12468649229667,-2.703640874356098
CITY OF LONDON,51.515869084539396,-0.09345024349003202
CONWY,53.125451225027945,-3.7469275629154897
CORNWALL,50.2491094902892,-4.642072961722217
COUNTY DURHAM,54.46928915708376,-1.840983172985692
COVENTRY,52.20619163815314,-1.5190329484575433
CROYDON,51.33122440611814,-0.07773715861848832
CUMBRIA,54.470582575648244,-2.902600383252353
DARLINGTON,54.51355967194039,-1.5680201999230523
DENBIGHSHIRE,53.07313542431554,-3.347662396412462
DERBY,52.98317870391253,-1.471762916352353
DERBYSHIRE,52.96237103431297,-1.6019383162802616
DEVON,50.75993290464059,-3.6572707805745353
DONCASTER,53.579077870304175,-1.1091519021581622
DORSET,50.80117614559981,-2.4141088997141975
DUDLEY,52.466075739334926,-2.101688961593882
EALING,51.69946371446451,-0.31413253292570953
EAST RIDING OF YORKSHIRE,53.9506321883079,-0.6619808168243948
EAST SUSSEX,50.8319515317622,0.33441692286193403
ENFIELD,51.79829813489722,-0.08133941451400101
ESSEX,51.61177562858481,0.5408806396014519
FLINTSHIRE,53.18448452051185,-3.176529270275655
GATESHEAD,54.984104331680726,-1.6867966327256207
GLOUCESTERSHIRE,51.95116469210396,-2.152140175011601
GREENWICH,51.298529627584855,0.05009798110429057
GWYNEDD,52.90798692199907,-3.815807248465912
HACKNEY,51.715573990309835,-0.06047668080560671
HALTON,53.37945371869939,-2.6885285111965866
HAMMERSMITH AND FULHAM,51.45669431471315,-0.21734862391196488
HAMPSHIRE,51.35882747857323,-1.2472236572124424
HARINGEY,51.71488485869694,-0.10670896820865851
HARROW,51.69502976226169,-0.3360141730528605
HARTLEPOOL,54.67019690697325,-1.2702881849113061
HAVERING,51.68803382335829,0.23538931286606415
"HEREFORDSHIRE, COUNTY OF",52.05661428266539,-2.7394973894756567
HERTFORDSHIRE,51.97545351306396,-0.2768104374496038
HILLINGDON,51.67744993832507,-0.44168376669816023
HOUNSLOW,51.31550103034914,-0.37851470463324743
ISLE OF ANGLESEY,53.27637540915653,-4.323495411729392
ISLE OF WIGHT,50.62684579406237,-1.3335589426514434
ISLES OF SCILLY,49.923857744201605,-6.302263516809768
ISLINGTON,51.66454658738323,-0.10992970115558956
KENSINGTON AND CHELSEA,51.49977592399342,-0.18981078381787103
KENT,51.066980402556894,0.72177006521006
"KINGSTON UPON HULL, CITY OF",53.894135701816644,-0.30380941990063115
KINGSTON UPON THAMES,51.42789080754545,-0.28368404321251495
KIRKLEES,53.84779145117579,-1.7808194218728275
KNOWSLEY,53.48284092504563,-2.8329791954991275
LAMBETH,51.252923290285565,-0.11380231585035454
LANCASHIRE,53.39410422518683,-2.460896340904076
LEEDS,53.55494339794778,-1.5074406609781625
LEICESTER,52.7035904712036,-1.1304165681356237
LEICESTERSHIRE,52.372384242153444,-1.3774821236258858
LEWISHAM,51.26146486742923,-0.017302263531446847
LINCOLNSHIRE,53.019325697607805,-0.23840017404638325
LIVERPOOL,53.51161042331058,-2.9133522899513755
LUTON,51.96794156247519,-0.4231450525783596
MANCHESTER,53.618174414336764,-2.2337215842169944
MEDWAY,51.32754494250598,0.5632336335498731
MERTHYR TYDFIL,51.749169200604825,-3.36403864047987
MERTON,51.37364806533906,-0.18868296177359278
MIDDLESBROUGH,54.5098082464691,-1.211038279554591
MILTON KEYNES,52.01693552290149,-0.7406232665194876
MONMOUTHSHIRE,51.78143655329183,-2.9039386644643197
NEATH PORT TALBOT,51.59538437854254,-3.7458617902677283
NEWCASTLE UPON TYNE,55.00208530426788,-1.652806624671881
NEWHAM,51.75154898367921,0.027418339450078835
NEWPORT,51.53253056059282,-2.8977514562758477
NORFOLK,52.3032223796034,0.9647662889518414
NORTH EAST LINCOLNSHIRE,53.50967645052903,-0.13922750148994814
NORTH LINCOLNSHIRE,53.57540769163687,-0.5237063875323392
NORTH SOMERSET,51.35265217208383,-2.754333708085771
NORTH TYNESIDE,55.00390319683472,-1.5092377782362794
NORTH YORKSHIRE,54.037083506236726,-1.5496083229591298
NORTHAMPTONSHIRE,52.090056204873584,-0.8673643733062965
NORTHUMBERLAND,55.268382697315424,-2.075107564148198
NOTTINGHAM,52.95517248670217,-1.166635297324727
NOTTINGHAMSHIRE,53.03298887412134,-1.006945929298795
OLDHAM,53.659965283524954,-2.052688245629671
OXFORDSHIRE,51.93769526591072,-1.2911207463303098
PEMBROKESHIRE,51.87232817560273,-4.908191395785854
PETERBOROUGH,52.62511626981561,-0.2689975241368676
PLYMOUTH,50.29446598251615,-4.112955625237552
PORTSMOUTH,50.91433206435089,-1.0702659081823802
POWYS,52.35028728472521,-3.4364646802117074
READING,51.48972751726377,-0.9907195716377762
REDBRIDGE,51.74619394585629,0.0701000048233879
REDCAR AND CLEVELAND,54.52674848959172,-1.0057471172413288
RICHMOND UPON THAMES,51.40228740909276,-0.28924251316631455
ROCHDALE,53.67734692115036,-2.14815188340053
ROTHERHAM,53.27571588878268,-1.2866084213986422
RUTLAND,52.66741819281054,-0.6255844565552813
SALFORD,53.39900474827836,-2.3848977331687684
SANDWELL,52.58696674791831,-2.007627650605722
SEFTON,53.41754419091054,-2.9918998460398845
SHEFFIELD,53.594572416421464,-1.5427564265432459
SHROPSHIRE,52.68421414164122,-2.7366875706426375
SLOUGH,51.500375556628576,-0.5761037634462686
SOLIHULL,52.36591301434561,-1.7157174664625492
SOMERSET,51.15203995716832,-3.2953379430424437
SOUTH GLOUCESTERSHIRE,51.619868102630875,-2.469430184260059
SOUTH TYNESIDE,54.994706019365786,-1.4469508035803413
SOUTHAMPTON,50.984805930473584,-1.4002768042215858
SOUTHEND-ON-SEA,51.562157807336284,0.7069905953535786
SOUTHWARK,51.26247572937943,-0.07306483663823536
ST. HELENS,53.442240723358644,-2.7032424159534347
STAFFORDSHIRE,52.54946704767607,-2.027491119365553
STOCKPORT,53.243567817667724,-2.1248973952531918
STOCKTON-ON-TEES,54.60356568786033,-1.3063893005278557
STOKE-ON-TRENT,53.0018684063432,-2.1588155163720084
SUFFOLK,52.07327606663186,1.049040133490474
SUNDERLAND,54.95658521287448,-1.433572135990224
SURREY,51.75817482314145,-0.3386369800762059
SUTTON,51.33189096687447,-0.17228958486126392
SWANSEA,51.734320352502984,-3.967180818043868
SWINDON,51.64295753076632,-1.7336382187066433
TAMESIDE,53.4185402114593,-2.0769462404028474
TELFORD AND WREKIN,52.709149095326744,-2.4894724871905916
THURROCK,51.508227793073466,0.33492786371540356
TORBAY,50.494049197230815,-3.5551646045072913
TORFAEN,51.69896506141925,-3.0509328418360218
TOWER HAMLETS,51.68485859523772,-0.03638140322291906
TRAFFORD,53.314621144815334,-2.3656560688750687
VALE OF GLAMORGAN,51.477096810804674,-3.3980039155600954
WAKEFIELD,53.81677380462442,-1.4208545508030999
WALSALL,52.742742908764974,-1.9703315889024553
WALTHAM FOREST,51.723501987712325,-0.01886180175957716
WANDSWORTH,51.24653418036352,-0.2001743797936436
WARRINGTON,53.338554119123636,-2.561564052456012
WARWICKSHIRE,52.04847200574421,-1.5686356193411675
WEST BERKSHIRE,51.472960442069805,-1.2740171035533379
WEST SUSSEX,51.11473921001523,-0.4593527537340543
WESTMINSTER,51.613346179755915,-0.15298252171750404
WIGAN,53.58763891955546,-2.5723844100365545
WILTSHIRE,51.48575283497703,-1.926537553406791
WINDSOR AND MAIDENHEAD,51.494612540256846,-0.6753936432282348
WIRRAL,53.237217504292545,-3.0650813262796417
WOKINGHAM,51.45966460093226,-0.8993706058495408
WOLVERHAMPTON,52.71684834050869,-2.127594624973283
WORCESTERSHIRE,52.05799103802506,-2.209184250840713
WREXHAM,53.00080440180421,-2.991958507191866
YORK,53.99232942499273,-1.073788787620359

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

48060
5/data science/2e/worldcities.csv Executable file

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

2530339
5/data science/3/1_02_EDA.ipynb Normal file

File diff suppressed because one or more lines are too long

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,776 @@
{
"cells": [
{
"cell_type": "markdown",
"id": "0bf7f930-76a1-4c16-84e4-cf1e73b54c55",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
]
},
{
"cell_type": "markdown",
"id": "400a41da-bc38-4e9a-9ece-d2744ffb16b0",
"metadata": {
"tags": []
},
"source": [
"# Enhancing Data Science Outcomes With Efficient Workflow #"
]
},
{
"cell_type": "markdown",
"id": "8897c66c-4f9d-48b4-a60b-ddae16f2f61b",
"metadata": {},
"source": [
"## 04 - Embeddings ##\n",
"In this lab, you will use high-performance computing to create machine learning solutions. This lab covers the model development portion of the data science workflow. A good machine learning solution excels that both accuracy and inference performance. \n",
"\n",
"<p><img src='images/pipeline_overview_2.png' width=1080></p>\n",
"\n",
"**Table of Contents**\n",
"<br>\n",
"This notebook covers the below sections: \n",
"1. [Entity Embedding](#s4-1)\n",
"2. [Training the Embeddings](#s4-2)\n",
" * [Preparing the Data - Normalization](#s4-2.1)\n",
" * [Model Building](#s4-2.2)\n",
" * [Being Training](#s4-2.3)\n",
"3. [Visualizing the Embeddings](#s4-3)\n",
"4. [Conclusion](#s4-4)"
]
},
{
"cell_type": "markdown",
"id": "28538773-6b95-4840-aca2-73a6f7d98b07",
"metadata": {},
"source": [
"<a name='s4-1'></a>\n",
"## Entity Embeddings ##\n",
"[Entity Embeddings](https://arxiv.org/pdf/1604.06737.pdf) are very similar to word embeddings used in NLP. They are a way to represent categorical features in a defined latent space. In the latent space, categories that are semantically similar have similar vectors. Embeddings can be trained to assign a learnable feature vector to each category. Using embeddings, each categorical value is mapped to its own associated vector representation that is more informative than a single point value. Even though embeddings require a large amount of data and computational resources to train, they have proven to be a great alternative encoding method to consider. Once trained, embeddings can boost the performance of downstream machine learning tasks when used as the input features. Users can combine the power of deep learning with traditional machine learning on tabular data. \n",
"\n",
"<p><img src='images/embedding.png' width=720></p>\n",
"\n",
"Reasons for using embeddings include: \n",
"* It is much more efficient than the one-hot approach for encoding when cardinality if high\n",
"* Allows rich relationships and complexities between categories to be captured\n",
"* Reduce memory usage and speed up downstream machine learning model training\n",
"* Once trained, the same embedding can be used for various use cases\n",
"* Can be used to visualize categorical data and for data clustering, since the embedding space quantifies semantic similarity as distance between the categories in the latent space\n",
"* Mitigates the need to perform cumbersome manual feature engineering, which requires extensive domain knowledge\n",
"\n",
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"Below are some tips about embeddings: \n",
"* Requires training with large amounts of data, making it inappropriate for unseen data such as when new categories are added\n",
"* Can overfit\n",
"* Difficult to interpret"
]
},
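{
"cell_type": "markdown",
"id": "entity-embedding-sketch",
"metadata": {},
"source": [
"As a minimal, illustrative sketch (not part of this lab's pipeline), the snippet below shows the core idea: a `torch.nn.Embedding` layer maps label-encoded category ids to dense vectors, and closeness between categories can then be measured in the latent space. The cardinality, dimension, and ids are made up for the example.\n",
"\n",
"```python\n",
"import torch\n",
"import torch.nn as nn\n",
"\n",
"# hypothetical categorical feature with 5 distinct values, embedded in 3 dimensions\n",
"emb = nn.Embedding(num_embeddings=5, embedding_dim=3)\n",
"\n",
"# label-encoded category ids for a small batch of records\n",
"ids = torch.tensor([0, 2, 2, 4])\n",
"vectors = emb(ids)  # shape (4, 3): one dense vector per record\n",
"\n",
"# after training, semantically similar categories end up with similar vectors;\n",
"# cosine similarity quantifies that closeness in the latent space\n",
"sim = nn.functional.cosine_similarity(emb.weight[0], emb.weight[2], dim=0)\n",
"print(vectors.shape, sim.item())\n",
"```"
]
},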
{
"cell_type": "markdown",
"id": "6ba4160d-4b41-40d3-93bc-f1fae0b9dddc",
"metadata": {},
"source": [
"<a name='s4-2'></a>\n",
"## Training the Embeddings ##\n",
"Embeddings aim to represent each entity as a numeric vector such that products in similar context have similar vectors. Mathematically, similar entities will have a large dot product whereas every entity when one-hot encoded has a zero dot product with every other entity. This is because all one-hot vectors are orthogonal. \n",
"\n",
"We will use [PyTorch](https://pytorch.org/) to train a simple fully-connected neural network. A surrogate problem is setup for the purpose of finding the embedding vectors. Neural networks have difficultly with sparse categorical features. Traditionally, embeddings are a way to reduce those features to increase model performance. \n",
"\n",
"Technically, the idea of an embedding layer is very similar to a dense or linear layer (without bias) in the neural network. When training an embedding this way, users will one-hot encode the categorical data so each record becomes a vector with C features, where C is the cardinality. We then perform matrix vector multiplication on the input vector and the weights before feeding the next layer. This is inefficient when the number of input features is large and sparse, as is the case for categorical features from a tabular dataset. \n",
"\n",
"A better and more efficient approach would be to train a `torch.nn.Embedding` layer, which can be treated as a \"lookup\" table with the label-encoded category id as the index. By using choosing this, we avoid one-hot encoding and the matrix vector multiplication. \n",
"\n",
"<p><img src='images/surrogate_problem.png' width=720></p>\n",
"\n",
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"Embeddings will naturally be affected by how the surrogate problem is defined. "
]
},
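{
"cell_type": "markdown",
"id": "onehot-vs-lookup-sketch",
"metadata": {},
"source": [
"To make the equivalence concrete, here is a small, illustrative sketch (with a made-up cardinality, not part of the lab's dataset): multiplying a one-hot vector by the weight matrix selects exactly the row that an `nn.Embedding` lookup returns directly, which is why the lookup is preferred for large, sparse categorical inputs.\n",
"\n",
"```python\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"\n",
"C, D = 6, 4  # cardinality and embedding dimension (made up for illustration)\n",
"emb = nn.Embedding(C, D)\n",
"\n",
"idx = torch.tensor([3])  # label-encoded category id\n",
"one_hot = F.one_hot(idx, num_classes=C).float()  # shape (1, C), sparse representation\n",
"\n",
"# dense-layer view: the one-hot vector times the weight matrix picks out row 3\n",
"via_matmul = one_hot @ emb.weight  # shape (1, D)\n",
"\n",
"# embedding view: direct lookup of row 3, no multiplication needed\n",
"via_lookup = emb(idx)  # shape (1, D)\n",
"\n",
"print(torch.allclose(via_matmul, via_lookup))  # True\n",
"```"
]
},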
{
"cell_type": "code",
"execution_count": 1,
"id": "ec50a570-247f-4cfc-8dc5-2c2b501de703",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# import dependencies\n",
"from tqdm import tqdm\n",
"import cudf\n",
"import cuml\n",
"import dask_cudf\n",
"\n",
"import torch\n",
"import torch.nn as nn\n",
"import torch.nn.functional as F\n",
"import torch.optim as torch_optim\n",
"from torch.utils.data import Dataset, DataLoader"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "036bf6ee-d5cb-4f20-a591-681706a098ac",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# set device cuda to use GPU\n",
"device=torch.device('cuda')"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "3726fc69-2a2b-42e2-be12-d235ce2322c1",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# define features and label\n",
"cols=['brand', 'cat_0', 'cat_1', 'cat_2', 'price', 'target']\n",
"cat_cols=['brand', 'cat_0', 'cat_1', 'cat_2']\n",
"label='target'\n",
"\n",
"feature_cols=[col for col in cols if col != label]\n",
"cont_cols=[col for col in feature_cols if col not in cat_cols] # ['price']"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "ae87d23f-0c67-4758-8842-ca5770e740f9",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Total of 2461697 records.\n"
]
}
],
"source": [
"# read data\n",
"parquet_dir='processed_parquet'\n",
"\n",
"ddf=dask_cudf.read_parquet(parquet_dir, columns=cols)\n",
"gdf=ddf.compute()\n",
"\n",
"print(f'Total of {len(gdf)} records.')"
]
},
{
"cell_type": "markdown",
"id": "b9110c9d-5924-4cb2-8bf3-cabd398aad0e",
"metadata": {},
"source": [
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"Even though we intend to keep all the data in one GPU, we still recommend loading data with `Dask-cuDF`. "
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "f782bc7e-e6c4-4d87-a839-5a99227dca7c",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>brand</th>\n",
" <th>cat_0</th>\n",
" <th>cat_1</th>\n",
" <th>cat_2</th>\n",
" <th>price</th>\n",
" <th>target</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>100.229996</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>871.839966</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>1</td>\n",
" <td>872.090027</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>2</td>\n",
" <td>6</td>\n",
" <td>5</td>\n",
" <td>2</td>\n",
" <td>306.690002</td>\n",
" <td>1</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>13</td>\n",
" <td>2</td>\n",
" <td>3</td>\n",
" <td>24</td>\n",
" <td>334.349976</td>\n",
" <td>1</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" brand cat_0 cat_1 cat_2 price target\n",
"0 1 6 5 2 100.229996 1\n",
"1 2 1 1 1 871.839966 1\n",
"2 2 1 1 1 872.090027 1\n",
"3 2 6 5 2 306.690002 1\n",
"4 13 2 3 24 334.349976 1"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"gdf.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "3673f202-7aea-43a7-a569-4c210a614529",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"{'brand': (3303, 7), 'cat_0': (14, 3), 'cat_1': (61, 3), 'cat_2': (90, 3)}"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# the embedding vectors will start with 0 so we decrease the categorical values by 1 to match\n",
"gdf[cat_cols]=gdf[cat_cols]-1\n",
"\n",
"n_uniques=gdf.nunique()\n",
"\n",
"# use higher of 4th root of nunique and 3 for vector dimension\n",
"embedding_sizes={col: (n_uniques[col], max(3, int(n_uniques[col]**0.25))) for col in cat_cols}\n",
"embedding_sizes"
]
},
{
"cell_type": "markdown",
"id": "a327c1f9-0683-45f1-90a6-6d4d4daa093c",
"metadata": {
"tags": []
},
"source": [
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"The size of embeddings can become very large. For example, large embeddings are usually needed for users and items for large platforms. "
]
},
{
"cell_type": "markdown",
"id": "2c1c7fee-dad0-4009-a55c-513465db8a7c",
"metadata": {},
"source": [
"<a name='s4-2.1'></a>\n",
"### Preparing the Data - Normalization ###\n",
"**Normalization** is required to enable neural networks to leverage numerical features. Tree-based models do not require normalization as they define the split independent of the scale of a feature. Without normalization, neural networks are difficult to train. The reason is that different numerical features have different scales. When we combine the features in a hidden layer, the different scales make it more difficult to extract patterns from it. \n",
"\n",
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"We will also implement a `torch.nn.BatchNorm1d`[[doc]](https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html) layer to mitigate the exploding gradient problem. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "fb1840b3-a7d8-4b91-98ef-bddf59afd5e6",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# normalize data\n",
"gdf['price']=cuml.preprocessing.StandardScaler().fit_transform(gdf[['price']])"
]
},
{
"cell_type": "markdown",
"id": "d6991948-f79a-4b51-b3a9-2571b2be5262",
"metadata": {
"tags": []
},
"source": [
"<a name='s4-2.2'></a>\n",
"### Model Building ###\n",
"We construct a model with several layers. The embeddings will be the same dimension as num_unique x vector_size. The embeddings will be concatenated, along with the continous variable(s), before they are fed into the next layer. "
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "35a8055b-8b7b-4fb8-8d3a-9f36fc03b171",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# define neural network with embedding layers\n",
"class ProductPurchaseModel(nn.Module):\n",
" def __init__(self, embedding_sizes, n_cont):\n",
" super().__init__()\n",
" # make an embedding for each categorical feature\n",
" # The `nn.Embedding` layer can be thought of as a lookup table where the key is \n",
" # the category index and the value is the corresponding embedding vector\n",
" self.embeddings=nn.ModuleList([nn.Embedding(n_categories, size) for n_categories, size in embedding_sizes.values()])\n",
" \n",
" # n_emb is the length of all embeddings combined\n",
" n_emb=sum(e.embedding_dim for e in self.embeddings)\n",
" \n",
" self.n_emb=n_emb\n",
" self.n_cont=n_cont\n",
" self.emb_drop = nn.Dropout(0.6)\n",
" \n",
" # apply dropout, batch norm and linear layers\n",
" self.bn1=nn.BatchNorm1d(self.n_cont)\n",
" self.lin1=nn.Linear(self.n_emb + self.n_cont, 200)\n",
" self.drop1=nn.Dropout(0.3)\n",
" self.bn2=nn.BatchNorm1d(200)\n",
" self.drop2=nn.Dropout(0.3)\n",
" self.lin2=nn.Linear(200, 70)\n",
" self.bn3=nn.BatchNorm1d(70)\n",
" self.lin3=nn.Linear(70, 2)\n",
"\n",
" def forward(self, X_cat, X_cont):\n",
" # map each categorical feature to the embedding vector on its corresponding embedding layer\n",
" x_1=[embedding(X_cat[:, idx]) for idx, embedding in enumerate(self.embeddings)]\n",
" \n",
" # concatenate all categorical embedding vectors together\n",
" x_1=torch.cat(x_1, 1)\n",
" \n",
" # apply random drop out, normalization, and activation\n",
" x_1=self.emb_drop(x_1)\n",
" x_2=self.bn1(X_cont)\n",
" \n",
" # concatenate categorical embeddings to input layer from continuous variable(s)\n",
" x_1=torch.cat([x_1, x_2], 1)\n",
" \n",
" # apply random drop out, normalization, and activation\n",
" x_1=F.relu(self.lin1(x_1))\n",
" x_1=self.drop1(x_1)\n",
" x_1=self.bn2(x_1)\n",
" x_1=F.relu(self.lin2(x_1))\n",
" x_1=self.drop2(x_1)\n",
" x_1=self.bn3(x_1)\n",
" x_1=self.lin3(x_1)\n",
" return x_1"
]
},
{
"cell_type": "markdown",
"id": "c52e50a2-99b6-4a8c-aa65-5f11a7806c6e",
"metadata": {},
"source": [
"<p><img src='images/tip.png' width=720></p>\n",
"\n",
"Tabular data uses shallow models with huge embedding tables and few feed-forward layers. "
]
},
{
"cell_type": "code",
"execution_count": 9,
"id": "5b7d18b1-d29e-43d4-8091-3aba41968ebf",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"data": {
"text/plain": [
"ProductPurchaseModel(\n",
" (embeddings): ModuleList(\n",
" (0): Embedding(3303, 7)\n",
" (1): Embedding(14, 3)\n",
" (2): Embedding(61, 3)\n",
" (3): Embedding(90, 3)\n",
" )\n",
" (emb_drop): Dropout(p=0.6, inplace=False)\n",
" (bn1): BatchNorm1d(1, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (lin1): Linear(in_features=17, out_features=200, bias=True)\n",
" (drop1): Dropout(p=0.3, inplace=False)\n",
" (bn2): BatchNorm1d(200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (drop2): Dropout(p=0.3, inplace=False)\n",
" (lin2): Linear(in_features=200, out_features=70, bias=True)\n",
" (bn3): BatchNorm1d(70, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)\n",
" (lin3): Linear(in_features=70, out_features=2, bias=True)\n",
")"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# instantiate model\n",
"model=ProductPurchaseModel(embedding_sizes, len(cont_cols))\n",
"model.to(device)"
]
},
{
"cell_type": "markdown",
"id": "f35dab8e-f1cd-484b-999e-b9e0f7e79edd",
"metadata": {},
"source": [
"Next, we define a `torch.utils.data.Dataset` class to be use by `torch.utils.data.DataLoader`. The Dataset is makes it easier to track separate categorical and continuous variables. The DatalLoader wraps an iterable around the Dataset to enable easy access to the samples. More information about Dataset and DataLoader can be found in quick PyTorch [guide](https://pytorch.org/tutorials/beginner/basics/data_tutorial.html). "
]
},
{
"cell_type": "code",
"execution_count": 10,
"id": "98f74906-7b79-4fda-8626-df17023ee512",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# define dataset\n",
"class myDataset(Dataset):\n",
" def __init__(self, X, y, cat_cols, cont_cols):\n",
" self.X_cat=torch.as_tensor(X.loc[:, cat_cols].copy().values.astype('int32'), device=device)\n",
" self.X_cont=torch.as_tensor(X.loc[:, cont_cols].copy().values.astype('float32'), device=device)\n",
" self.y=torch.as_tensor(y.astype('int64'), device=device)\n",
" \n",
" def __len__(self):\n",
" return len(self.y)\n",
" \n",
" def __getitem__(self, idx): \n",
" return self.X_cat[idx], self.X_cont[idx], self.y[idx]"
]
},
{
"cell_type": "code",
"execution_count": 11,
"id": "a0973509-6a11-49d8-b346-ab9ec8cfaef5",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# instantiate dataset\n",
"X_train=gdf[feature_cols]\n",
"y_train=gdf['target'].values\n",
"\n",
"train_ds=myDataset(X_train, y_train, cat_cols, cont_cols)"
]
},
{
"cell_type": "markdown",
"id": "5336cfd0-39ed-4285-9b66-e4f5d1b7d75e",
"metadata": {},
"source": [
"<a name='s4-2.3'></a>\n",
"### Begin Training ###\n",
"We will set some parameters for training. "
]
},
{
"cell_type": "code",
"execution_count": 12,
"id": "0604708e-1c2c-485b-a029-eadd17356a03",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# set optimizer\n",
"def get_optimizer(model, lr = 0.001, wd = 0.0):\n",
" parameters=filter(lambda p: p.requires_grad, model.parameters())\n",
" optim=torch_optim.Adam(parameters, lr=lr, weight_decay=wd)\n",
" return optim"
]
},
{
"cell_type": "code",
"execution_count": 13,
"id": "39e0ce25-f65c-4330-98cc-34ee4b30bae4",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [],
"source": [
"# define training function\n",
"def train_model(model, optim, train_dl):\n",
" # set the model to training, which is useful for BatchNorm and Dropout layers that behave differently during training and evaluation\n",
" model.train()\n",
" total=0\n",
" sum_loss=0\n",
" \n",
" # iterate through batches\n",
" for b, (X_cat, X_cont, y) in enumerate(train_dl):\n",
" batch=y.shape[0]\n",
" \n",
" # forward pass\n",
" output=model(X_cat, X_cont)\n",
" \n",
" # calculate loss\n",
" loss=F.cross_entropy(output, y)\n",
" \n",
" # zero out the gradients so the parameters update correctly, otherwise gradients would be combined with old\n",
" optim.zero_grad()\n",
" loss.backward()\n",
" optim.step()\n",
" \n",
" # calculate total loss per batch\n",
" total+=batch\n",
" sum_loss+=batch*(loss.item())\n",
" return sum_loss/total"
]
},
{
"cell_type": "markdown",
"id": "a60dd511-3121-4eb0-beb7-3a03d56de202",
"metadata": {},
"source": [
"Instantiate a `torch.utils.data.DataLoader` and begin training. "
]
},
{
"cell_type": "code",
"execution_count": 14,
"id": "5a25e4e6-f0b5-4bbc-8a1d-0eee74c7faaf",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# define training loop\n",
"def train_loop(model, epochs, lr=0.01, wd=0.0):\n",
" # instantiate optimizer\n",
" optim=get_optimizer(model, lr = lr, wd = wd)\n",
" \n",
" # iterate through number of epochs\n",
" for i in tqdm(range(epochs)): \n",
" loss=train_model(model, optim, train_dl)\n",
" print(\"training loss: \", round(loss, 3))"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "68b43459-fb0a-4c13-9371-7c15327ff624",
"metadata": {
"scrolled": true,
"tags": []
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
" 33%|███▎ | 1/3 [00:28<00:57, 28.79s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"training loss: 0.666\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
" 67%|██████▋ | 2/3 [00:57<00:28, 28.67s/it]"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"training loss: 0.665\n"
]
}
],
"source": [
"%%time\n",
"\n",
"# define batch size and begin training\n",
"batch_size=1000\n",
"train_dl=DataLoader(train_ds, batch_size=batch_size, shuffle=True)\n",
"\n",
"train_loop(model, epochs=3, lr=0.05, wd=0.00001)"
]
},
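  {
   "cell_type": "markdown",
   "id": "added-note-eval-sketch",
   "metadata": {},
   "source": [
    "Because BatchNorm and Dropout behave differently at inference time, it can be useful to double-check the fit after training. The next cell is an optional, minimal sketch (it assumes only `model`, `train_dl`, and `torch` from the cells above, and was not part of the original workflow) that switches the model to evaluation mode and reports accuracy on the training data."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-code-eval-sketch",
   "metadata": {},
   "outputs": [],
   "source": [
    "# optional, illustrative sketch: evaluate the trained model on the training data\n",
    "model.eval()  # switch BatchNorm/Dropout to inference behaviour\n",
    "correct, seen = 0, 0\n",
    "with torch.no_grad():\n",
    "    for X_cat, X_cont, y in train_dl:\n",
    "        preds = model(X_cat, X_cont).argmax(dim=1)\n",
    "        correct += (preds == y).sum().item()\n",
    "        seen += y.shape[0]\n",
    "print('training accuracy:', round(correct / seen, 3))"
   ]
  },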
{
"cell_type": "markdown",
"id": "7d6656b3-3642-4279-b787-0c034c45b739",
"metadata": {},
"source": [
"<a name='s4-3'></a>\n",
"## Visualizing the Embeddings ##"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "20973ee4-a723-4931-bf50-8efffe275026",
"metadata": {
"tags": []
},
"outputs": [],
"source": [
"# visualize embeddings\n",
"\n",
"# import dependencies\n",
"import plotly.express as px\n",
"import pandas as pd\n",
"\n",
"# pick category to visualize\n",
"category='brand'\n",
"\n",
"category_label=pd.read_parquet(f'categories/unique.{category}.parquet')[category]\n",
"category_label=category_label[1:]\n",
"\n",
"embeddings_idx=list(embedding_sizes.keys()).index(category)\n",
"embeddings=model.embeddings[embeddings_idx].weight.detach().cpu().numpy()\n",
"\n",
"fig=px.scatter_3d(\n",
" x=embeddings[:, 0], \n",
" y=embeddings[:, 1], \n",
" z=embeddings[:, 2], \n",
" text=category_label, \n",
" height=720\n",
")\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "130a2b16-89e5-4eda-8155-014a75a3638e",
"metadata": {},
"outputs": [],
"source": [
"# persist embeddings\n",
"!mkdir trained_embedding_weights\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"for idx, each_col in enumerate(cat_cols): \n",
" weights=model.embeddings[idx].weight.detach().cpu().numpy()\n",
" pd.DataFrame(weights).to_csv(f'trained_embedding_weights/{each_col}.csv', index=False)"
]
},
{
"cell_type": "markdown",
"id": "bc7cce0e-6dcb-4d5a-82dd-e8074abaaaec",
"metadata": {},
"source": [
"<a name='s4-4'></a>\n",
"## Conclusion ##\n",
"Deep Learning is very good at feature extraction, which can be used for finding categorical embeddings. This is the advantage of using a Deep Learning approach, as it requires way less feature engineering and less dependent on domain knowledge. "
]
},
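  {
   "cell_type": "markdown",
   "id": "added-note-embedding-reuse",
   "metadata": {},
   "source": [
    "As a hedged illustration of that idea, the sketch below (hypothetical; it assumes only the CSV files written to `trained_embedding_weights/` above) reloads the saved `brand` weights and looks up the vectors for a few example encoded brand values. Rows gathered this way can be fed to a downstream model as dense features."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "added-code-embedding-reuse",
   "metadata": {},
   "outputs": [],
   "source": [
    "# illustrative sketch: reload persisted embedding weights and use them as a lookup table\n",
    "import numpy as np\n",
    "import pandas as pd\n",
    "\n",
    "brand_weights = pd.read_csv('trained_embedding_weights/brand.csv').to_numpy()\n",
    "print(brand_weights.shape)  # (number of encoded brand values, embedding dimension)\n",
    "\n",
    "# hypothetical encoded brand values; in practice these come from the same label encoding used during training\n",
    "example_codes = np.array([0, 1, 2])\n",
    "print(brand_weights[example_codes])"
   ]
  },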
{
"cell_type": "markdown",
"id": "997bd6f7-9efb-4fee-b3d4-9d4454694c7b",
"metadata": {},
"source": [
"<a href=\"https://www.nvidia.com/dli\"> <img src=\"images/DLI_Header.png\" alt=\"Header\" style=\"width: 400px;\"/> </a>"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.16"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
