diff --git a/circuit/25-1/dz/24.11.pdf b/circuit/25-1/dz/24.11.pdf new file mode 100644 index 0000000..8fca779 Binary files /dev/null and b/circuit/25-1/dz/24.11.pdf differ diff --git a/circuit/25-2/course_project/drawio/bo-func.drawio b/circuit/25-2/course_project/drawio/bo-func.drawio index c8d33fe..2d88fef 100644 --- a/circuit/25-2/course_project/drawio/bo-func.drawio +++ b/circuit/25-2/course_project/drawio/bo-func.drawio @@ -1,365 +1,446 @@ - + - + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - + + - - - - - - - - + + - - + + - - + + - - + + - - + + - - + + - - + + - + - - + + - - + + - - + + - - + + - - + + - - + + - - - - + + + + - - - - - - - - - - - - - + + - - + + - - + + - - - - + + + + - - + + - - - - + + + + - - + + - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - + + - - + + - - + + - - + + - - - - + + + + - - - - + + + + - - + + - - + + - - - - + + + + - - + + - - - - + + + + - - - - + + - - + + + - - + + - - - - + + + + - - + + - - + + - + - - - + + + - - + + - - + + - - - - + + + + - - + + - - - - + + + + - - - + + + - - + + - - + + - - - - + + - - + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + + diff --git a/circuit/25-2/course_project/img/bo-cs1.png b/circuit/25-2/course_project/img/bo-cs1.png new file mode 100644 index 0000000..dce6e5b Binary files /dev/null and b/circuit/25-2/course_project/img/bo-cs1.png differ diff --git a/circuit/25-2/course_project/img/bo-cs2.png b/circuit/25-2/course_project/img/bo-cs2.png new file mode 100644 index 0000000..c3ab574 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-cs2.png differ diff --git a/circuit/25-2/course_project/img/bo-cs3.png b/circuit/25-2/course_project/img/bo-cs3.png new 
file mode 100644 index 0000000..753357b Binary files /dev/null and b/circuit/25-2/course_project/img/bo-cs3.png differ diff --git a/circuit/25-2/course_project/img/bo-func.png b/circuit/25-2/course_project/img/bo-func.png index a07f0ed..c6613d3 100644 Binary files a/circuit/25-2/course_project/img/bo-func.png and b/circuit/25-2/course_project/img/bo-func.png differ diff --git a/circuit/25-2/course_project/img/bo-ra.png b/circuit/25-2/course_project/img/bo-ra.png new file mode 100644 index 0000000..df4ce05 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-ra.png differ diff --git a/circuit/25-2/course_project/img/bo-rb.png b/circuit/25-2/course_project/img/bo-rb.png new file mode 100644 index 0000000..5c4f9fb Binary files /dev/null and b/circuit/25-2/course_project/img/bo-rb.png differ diff --git a/circuit/25-2/course_project/img/bo-rpr.png b/circuit/25-2/course_project/img/bo-rpr.png new file mode 100644 index 0000000..9dd2e60 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-rpr.png differ diff --git a/circuit/25-2/course_project/img/bo-rr.png b/circuit/25-2/course_project/img/bo-rr.png new file mode 100644 index 0000000..d75c197 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-rr.png differ diff --git a/circuit/25-2/course_project/img/bo-scheme.png b/circuit/25-2/course_project/img/bo-scheme.png new file mode 100644 index 0000000..bd70504 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-scheme.png differ diff --git a/circuit/25-2/course_project/img/bo-sm.png b/circuit/25-2/course_project/img/bo-sm.png new file mode 100644 index 0000000..cc91528 Binary files /dev/null and b/circuit/25-2/course_project/img/bo-sm.png differ diff --git a/circuit/25-2/course_project/main.tex b/circuit/25-2/course_project/main.tex index e4cbc2e..3b255e9 100644 --- a/circuit/25-2/course_project/main.tex +++ b/circuit/25-2/course_project/main.tex @@ -148,7 +148,7 @@ $$[A]_{\text{п}} = a_3, a_2 a_1 a_0$$ $b_0 = 1$ & & 0 & 1 & 1 & 1 
& & & & \makecell[l]{RA} \\ \hline & 0 & 0 & 1 & 1 & 1 & 0 & 0 & 0 & \makecell[l]{RR} \\ & 0 & 0 & 0 & 1 & 1 & 1 & 0 & 0 & \makecell[l]{RR >> 1} \\ - $b_1 = 0$ & \multicolumn{8}{c}{\text{пропуск такта суммирования}} & \\ \hline + $b_1 = 0$ & \multicolumn{9}{l}{\text{пропуск такта суммирования}} \\ \hline & 0 & 0 & 0 & 1 & 1 & 1 & 0 & 0 & \makecell[l]{RR} \\ & 0 & 0 & 0 & 0 & 1 & 1 & 1 & 0 & \makecell[l]{RR >> 1} \\ $b_2 = 1$ & & 0 & 1 & 1 & 1 & & & & \makecell[l]{RA} \\ \hline @@ -222,8 +222,12 @@ $$[A]_{\text{п}} = a_3, a_2 a_1 a_0$$ Обознач & Назначение УМНОЖЕНИЕ & ПРОВЕРКА БИТА \\ \hline RA & Регистр 1-го операнда, 4 разряда, множимое, параллельная загрузка, хранение & позиция бита, параллельная загрузка \\ \hline RB & Регистр 2-го операнда, 4 разряда, множитель, параллельная загрузка, хранение & регистр данных, параллельная загрузка \\ \hline - RR & Регистр результата, 8 разрядов, параллельная загрузка, хранение, сдвиг вправо на 1 разряд & новое значение регистра RB, 4 разряда, параллельная загрузка, хранение \\ \hline + RR & Регистр результата, 8 разрядов, параллельная загрузка, хранение, сдвиг вправо на 1 разряд & новое значение регистра RB, 4 старших разряда, параллельная загрузка, хранение \\ \hline RPR & & Регистр флага CF, 1 разряд, параллельная загрузка, хранение \\ \hline + КС1 & Выбор множимого: RB или RR[6:3]. Формирование сигнала F0 для МУУ & \\ \hline + КС2 & Формирование знака для КС3 & Новое значение RB с установленным битом для КС3. Формирование RPR \\ \hline + КС3 & Загрузка результата суммирования в RR[6:3] или знака в RR[7] & Загрузка нового значения RB с установленным битом в RR[6:3] \\ \hline + SM & 4-разрядный сумматор. 
Пропуск такта & \\ \hline & & \end{tabular} \caption{Элементы функциональной схемы} @@ -231,4 +235,186 @@ $$[A]_{\text{п}} = a_3, a_2 a_1 a_0$$ \end{minipage} \end{figure} +\newpage +\subsection{Проектирование логических элементов блока операций} +\subsubsection{Регистр первого операнда RA} + +Данный регистр является четырёхразрядным регистром хранения. Наиболее подходящим для реализации функций регистра RA является регистр FD4CE + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов RA + \begin{tabular}{|c|c|} + \hline + Y0 & Назначение \\ \hline + 0 & Хранение \\ \hline + 1 & Загрузка \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.3\textwidth]{bo-ra} + \caption{Логическая схема RA} + \label{bo-ra} + \end{figure} +\end{minipage} + +\subsubsection{Регистр второго операнда RB} + +Данный регистр является четырёхразрядным регистром хранения. Наиболее подходящим для реализации функций регистра RB является регистр FD4CE + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов RB + \begin{tabular}{|c|c|} + \hline + Y0 & Назначение \\ \hline + 0 & Хранение \\ \hline + 1 & Загрузка \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.3\textwidth]{bo-rb} + \caption{Логическая схема RB} + \label{bo-rb} + \end{figure} +\end{minipage} + +\subsubsection{Регистр результата RR} + +Данный регистр является 8-разрядным регистром с логическим сдвигом вправо на 1 разряд, хранением, параллельной загрузкой. 
Наиболее подходящим для реализации функции регистра RR является регистр SR8CLE + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов RR + \begin{tabular}{|c|c|c|} + \hline + Y5 & Y4 & Назначение \\ \hline + 0 & 0 & Хранение \\ \hline + 0 & 1 & Загрузка \\ \hline + 1 & 0 & SHR \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.8\textwidth]{bo-rr} + \caption{Логическая схема RR} + \label{bo-rr} + \end{figure} +\end{minipage} + +\subsubsection{Регистр признака RPR} + +Данный регистр является 1-разрядным с хранением и загрузкой. Наиболее подходящим для реализации функции регистра RPR является + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов RPR + \begin{tabular}{|c|c|} + \hline + Y7 & Назначение \\ \hline + 0 & Хранение \\ \hline + 1 & Загрузка \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.3\textwidth]{bo-rpr} + \caption{Логическая схема RPR} + \label{bo-rpr} + \end{figure} +\end{minipage} + +\subsubsection{Комбинационная схема КС1} + +С помощью комбинационной схемы КС1 осуществляется выбор множимого из RB или RR[6:3], а также формируется сигнал F0 для МУУ + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов КС1 + \begin{tabular}{|c|c|} + \hline + Y6 & Назначение \\ \hline + 0 & Выбор RB \\ \hline + 1 & Выбор RR[6:3] \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.8\textwidth]{bo-cs1} + \caption{Логическая схема КС1} + \label{bo-cs1} + \end{figure} +\end{minipage} + +\newpage +\subsubsection{Комбинационная схема КС2} + +С помощью комбинационной схемы КС2 осуществляется формирование знака для операции умножения, формирование нового значения RB с установленным битом и формирование флага CF для операции установки бита + 
+\begin{figure}[H] + \centering + \includegraphics[width=0.6\textwidth]{bo-cs2} + \caption{Логическая схема КС2} + \label{bo-cs2} +\end{figure} + +\newpage +\subsubsection{Комбинационная схема КС3} + +С помощью комбинационной схемы КС3 осуществляется загрузка промежуточной суммы в RR[6:3] и знака в RR[7] в операции умножения, а также нового значения RB с установленным битом в RR[6:3] в операции установки бита + +\begin{minipage}{0.3\textwidth} + \centering + Таблица управляющих сигналов КС3 + \begin{tabular}{|c|c|c|} + \hline + Y3 & Y2 & Назначение \\ \hline + 0 & 0 & сумма в RR[6:3] \\ \hline + 0 & 1 & знак в RR[7] \\ \hline + 1 & 0 & значение RB в RR[6:3] \\ \hline + \end{tabular} +\end{minipage} +\begin{minipage}{0.69\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.8\textwidth]{bo-cs3} + \caption{Логическая схема КС3} + \label{bo-cs3} + \end{figure} +\end{minipage} + +\newpage +\subsubsection{Сумматор SM} + +4-разрядный, старшие разряды множимого и множителя, carry input установлены в 0 + +\begin{minipage}{\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=0.6\textwidth]{bo-sm} + \caption{Логическая схема SM} + \label{bo-sm} + \end{figure} +\end{minipage} + +\newpage +\subsection{Логическая схема блока операций} + +\begin{minipage}{\textwidth} + \begin{figure}[H] + \centering + \includegraphics[width=\textwidth]{bo-scheme} + \caption{Логическая схема БО} + \label{bo-scheme} + \end{figure} +\end{minipage} + + \end{document} diff --git a/ds/25-1/1e/p2pkh.ipynb b/ds/25-1/1e/p2pkh.ipynb deleted file mode 100644 index 08a790b..0000000 --- a/ds/25-1/1e/p2pkh.ipynb +++ /dev/null @@ -1,386 +0,0 @@ -{ - "cells": [ - { - "cell_type": "code", - "execution_count": 2, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "from numba import cuda\n", - "from numba import vectorize\n", - "import numpy as np" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": { - "collapsed": true - }, - "outputs": [], 
- "source": [ - "U8_MAX = 0xFFFFFFFFFFFFFFFF\n", - "U32_MAX = 0xFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFFF\n", - "\n", - "def bignum_to_u32(num):\n", - " res = np.empty(4, np.uint64)\n", - " res[0] = num & U8_MAX\n", - " res[1] = (num >> 64) & U8_MAX\n", - " res[2] = (num >> 128) & U8_MAX\n", - " res[3] = (num >> 192) & U8_MAX\n", - " return res\n", - "\n", - "def u32_to_bignum(arr):\n", - " return int(arr[3]) << 192 | \\\n", - " int(arr[2]) << 128 | \\\n", - " int(arr[1]) << 64 | \\\n", - " int(arr[0]) \n", - "\n", - "Gx = bignum_to_u32(55066263022277343669578718895168534326250603453777594175500187360389116729240)\n", - "Gy = bignum_to_u32(32670510020758816978083085130507043184471273380659243275938904335757337482424)\n", - "p = bignum_to_u32(2**256 - 2**32 - 2**9 - 2**8 - 2**7 - 2**6 - 2**4 - 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 67, - "metadata": {}, - "outputs": [], - "source": [ - "@cuda.jit('void(u8[:], u8[:], u8[:])', device=True)\n", - "def add_u32(a, b, out):\n", - " carry = np.uint64(0)\n", - " for i in range(4):\n", - " ai = a[i]\n", - " bi = b[i]\n", - " outi = ai + bi\n", - " out[i] = outi + carry\n", - " carry = outi < ai or outi == U8_MAX and carry\n", - "\n", - "@cuda.jit('void(u8[:], u8[:], u8[:])', device=True)\n", - "def sub_u32(a, b, out):\n", - " borrow = np.uint64(0)\n", - " for i in range(4):\n", - " ai = a[i]\n", - " bi = b[i]\n", - " outi = ai - bi\n", - " out[i] = outi - borrow\n", - " borrow = ai < bi or outi == 0 and borrow\n", - "\n", - "@cuda.jit('void(u8[:], u8)', device=True)\n", - "def shr_u32(out, bits):\n", - " bits = np.uint64(bits)\n", - " lost = np.uint64(0)\n", - " for i in range(4):\n", - " outi = out[3 - i]\n", - " out[3 - i] = (outi >> bits) | lost\n", - " lost = outi << (np.uint64(64) - bits)\n", - " \n", - "@cuda.jit('void(u8[:], u8[:], u8[:])', device=True)\n", - "def mul_u32(a, b, out):\n", - " for i in range(4):\n", - " out[i] = 0\n", - " \n", - " for i in range(255, -1, 
-1):\n", - " quad_pos = np.uint64(i // 64)\n", - " bit_pos = np.uint64(i % 64)\n", - " bit = b[quad_pos] >> bit_pos\n", - " if bit % 2 == 1:\n", - " add_u32(a, out, out)\n", - " \n", - " shr_u32(out, 1)" - ] - }, - { - "cell_type": "code", - "execution_count": 78, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "failed on shr 0b0 0b110000000110100101\n", - "failed on shr 0b11111111111111111111111111111111111111111111111111111111111111111111111111111111111111 0b1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111011111100100000111011\n", - "failed on shr 0b11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 0b111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111010100000101111100111110110011110100011011000010010\n", - "failed on shr 0b111111111111111111111111111111111111111111111111111 0b111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111110101100000010111101100110100111110011000000011010\n", - "failed on shr 0b1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 0b11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111101011110\n", - "failed on shr 
0b1111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111 0b11111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111111100010100000001110001001100001111\n", - "failed on shr 0b0 0b11110100110111011010100011011110011010000010011101010\n", - "failed on shr 0b0 0b10000010101011110110111000000100100010011110111111011101111\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/appuser/Miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:7: RuntimeWarning: overflow encountered in ulong_scalars\n", - " import sys\n", - "/home/appuser/Miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:28: RuntimeWarning: overflow encountered in ulong_scalars\n", - "/home/appuser/Miniconda3/lib/python3.6/site-packages/ipykernel_launcher.py:17: RuntimeWarning: overflow encountered in ulong_scalars\n" - ] - } - ], - "source": [ - "from random import randint\n", - "\n", - "@cuda.jit('void(u8[:], u8[:], u8[:])')\n", - "def mul_u32_ker(a, b, out):\n", - " mul_u32(a, b, out)\n", - "\n", - "for i in range(10):\n", - " a = randint(0, U8_MAX)\n", - " b = randint(0, U8_MAX)\n", - " bit = randint(0, 256)\n", - " a32 = bignum_to_u32(a)\n", - " b32 = bignum_to_u32(b)\n", - " out32 = np.empty(4, np.uint64)\n", - " \n", - " add_u32.py_func(a32, b32, out32)\n", - " out = (a + b) % (U32_MAX + 1)\n", - " if out != u32_to_bignum(out32):\n", - " print(\"failed on add\")\n", - " \n", - " sub_u32.py_func(a32, b32, out32)\n", - " out = (a - b) % (U32_MAX + 1)\n", - " if out != u32_to_bignum(out32):\n", - " print(\"failed on sub\")\n", - " \n", - " shr_u32.py_func(out32, bit)\n", - " out >>= bit\n", - " if out != u32_to_bignum(out32):\n", - " print(\"failed on shr\", bin(out), bin(u32_to_bignum(out32)))\n", - " \n", - " 
#mul_u32_ker(a32, b32, out32)\n", - " #out = (a * b) % (U32_MAX + 1)\n", - " #if out != u32_to_bignum(out32):\n", - " # print(\"failed on mul\", a, b, u32_to_bignum(out32))\n", - " " - ] - }, - { - "cell_type": "code", - "execution_count": 43, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [ - "@cuda.jit(device=True)\n", - "def cmp_ge256(a, b):\n", - " # return True if a >= b\n", - " # compare from most significant word\n", - " for i in range(3, -1, -1):\n", - " ai = a[i]\n", - " bi = b[i]\n", - " if ai > bi:\n", - " return True\n", - " elif ai < bi:\n", - " return False\n", - " return True # equal\n", - "\n", - "@cuda.jit(device=True)\n", - "def sub256(a, b, out):\n", - " # assume a >= b, perform out = a - b\n", - " borrow = 0\n", - " for i in range(4):\n", - " ai = a[i]\n", - " bi = b[i]\n", - " # compute ai - bi - borrow\n", - " tmp = ai - bi\n", - " borrow1 = 1 if tmp > ai else 0 # tmp < 0 -> wrapping -> tmp > ai\n", - " tmp2 = tmp - borrow\n", - " borrow2 = 1 if tmp2 > tmp else 0\n", - " out[i] = tmp2\n", - " borrow = 1 if (borrow1 or borrow2) else 0\n", - " # if borrow != 0 then a < b (but we assumed a>=b)\n", - "\n", - "# 64x64 -> 128 using 32-bit split\n", - "@cuda.jit(device=True)\n", - "def mul64wide(x, y):\n", - " MASK32 = (1 << 32) - 1\n", - " x_lo = x & MASK32\n", - " x_hi = x >> 32\n", - " y_lo = y & MASK32\n", - " y_hi = y >> 32\n", - "\n", - " p0 = x_lo * y_lo # <= 64 bits\n", - " p1 = x_lo * y_hi # <= 64 bits\n", - " p2 = x_hi * y_lo # <= 64 bits\n", - " p3 = x_hi * y_hi # <= 64 bits\n", - "\n", - " # combine: p0 + (p1<<32) + (p2<<32) + (p3<<64)\n", - " mid = p1 + p2\n", - " carry_mid = 0\n", - " # lower 64:\n", - " low = (p0 + ((mid & MASK32) << 32)) & ((1 << 64) - 1)\n", - " # carry from lower additions\n", - " if (p0 + ((mid & MASK32) << 32)) >> 64:\n", - " carry_mid = 1\n", - " high = p3 + (mid >> 32) + carry_mid\n", - " # low, high are 64-bit parts of 128-bit product\n", - " return low & ((1 << 64) - 1), high & ((1 << 
64) - 1)\n", - "\n", - "@cuda.jit(device=True)\n", - "def mul256_full(a, b, out8):\n", - " # out8 must be length 8 (little-endian) to hold full 512-bit product\n", - " # Initialize\n", - " for i in range(8):\n", - " out8[i] = 0\n", - "\n", - " # schoolbook: for i in 0..3, j in 0..3\n", - " temp = cuda.local.array(8, dtype=np.uint64) # local accumulator\n", - " for i in range(8):\n", - " temp[i] = 0\n", - "\n", - " for i in range(4):\n", - " ai = a[i]\n", - " for j in range(4):\n", - " bj = b[j]\n", - " lo, hi = mul64wide(ai, bj) # 128-bit product\n", - " k = i + j\n", - " # add lo to temp[k], handle carry\n", - " s, c = add64_carry(temp[k], lo, 0)\n", - " temp[k] = s\n", - " # propagate carry to next word together with hi\n", - " carry = hi + c\n", - " t_idx = k + 1\n", - " while carry != 0:\n", - " s2, c2 = add64_carry(temp[t_idx], carry, 0)\n", - " temp[t_idx] = s2\n", - " # compute new carry (0/1) from addition\n", - " carry = 1 if c2 else 0\n", - " t_idx += 1\n", - " # t_idx never exceeds 7 because i+j <= 6, plus propagation safe\n", - " # copy to out8\n", - " for i in range(8):\n", - " out8[i] = temp[i]\n", - "\n", - "@cuda.jit(device=True)\n", - "def mul256_lo(a, b, out4):\n", - " # compute full product and keep lower 4 words\n", - " out8 = cuda.local.array(8, dtype=np.uint64)\n", - " mul256_full(a, b, out8)\n", - " for i in range(4):\n", - " out4[i] = out8[i]\n", - "\n", - "# --------------------\n", - "# division: binary long division\n", - "# --------------------\n", - "@cuda.jit(device=True)\n", - "def shl1_256_inplace(x):\n", - " # x <<= 1 (inplace)\n", - " carry = 0\n", - " for i in range(4):\n", - " new_carry = (x[i] >> 63) & 1\n", - " x[i] = (x[i] << 1) & ((1 << 64) - 1)\n", - " x[i] |= carry\n", - " carry = new_carry\n", - "\n", - "@cuda.jit(device=True)\n", - "def shr1_256_getbit(x):\n", - " # shift right by 1, return LSB (bit 0) before shift\n", - " lsb = x[0] & 1\n", - " carry = 0\n", - " for i in range(3, -1, -1):\n", - " new_carry = x[i] & 1\n", 
- " x[i] = (x[i] >> 1) | (carry << 63)\n", - " carry = new_carry\n", - " return lsb\n", - "\n", - "@cuda.jit(device=True)\n", - "def get_bit256(x, idx):\n", - " # idx: 0..255, 0 = least significant bit\n", - " w = idx // 64\n", - " b = idx % 64\n", - " return (x[w] >> b) & 1\n", - "\n", - "@cuda.jit(device=True)\n", - "def set_bit256(x, idx):\n", - " w = idx // 64\n", - " b = idx % 64\n", - " x[w] |= (1 << b)\n", - "\n", - "@cuda.jit(device=True)\n", - "def copy256(src, dst):\n", - " for i in range(4):\n", - " dst[i] = src[i]\n", - "\n", - "@cuda.jit(device=True)\n", - "def zero256(x):\n", - " for i in range(4):\n", - " x[i] = 0\n", - "\n", - "@cuda.jit(device=True)\n", - "def divmod256(dividend, divisor, q, r):\n", - " # Binary long division (restoring), bit-by-bit from MSB..LSB\n", - " # q,r are output arrays (4 words). dividend/divisor are input arrays.\n", - " # edge cases\n", - " zero = True\n", - " for i in range(4):\n", - " if divisor[i] != 0:\n", - " zero = False\n", - " break\n", - " if zero:\n", - " # division by zero — define q=0,r=dividend (user must avoid)\n", - " for i in range(4):\n", - " q[i] = 0\n", - " r[i] = dividend[i]\n", - " return\n", - "\n", - " zero256(q)\n", - " zero256(r)\n", - " # iterate bits from highest (255) down to 0\n", - " for i in range(255, -1, -1):\n", - " # left shift r by 1\n", - " shl1_256_inplace(r)\n", - " # bring in bit i of dividend\n", - " if get_bit256(dividend, i):\n", - " r[0] |= 1 # set lsb\n", - " # if r >= divisor then r -= divisor and set q[i] = 1\n", - " if cmp_ge256(r, divisor):\n", - " # r = r - divisor\n", - " tmp = cuda.local.array(4, dtype=np.uint64)\n", - " sub256(r, divisor, tmp)\n", - " for k in range(4):\n", - " r[k] = tmp[k]\n", - " # set q bit i\n", - " w = i // 64\n", - " b = i % 64\n", - " q[w] |= (1 << b)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": { - "collapsed": true - }, - "outputs": [], - "source": [] - } - ], - "metadata": { - "kernelspec": { - 
"display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.6.10" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -}