Estructura de matrices vs Matriz de Estructuras en CUDA

Question

Estructura de matrices vs Matriz de Estructuras en CUDA

De algunos comentarios que he leído aquí, por alguna razón es preferible tener Structure of Arrays (SoA) cambio Array of Structures (AoS) para implementaciones paralelas como CUDA? Si eso es cierto, ¿puede alguien explicar por qué? Gracias de antemano!

38

c struct arrays cuda

Author: JackOLantern, 2013-07-29

Source

3 answers

Solo quiero proporcionar un ejemplo simple que muestre cómo una Estructura de Matrices (SoA) funciona mejor que una Matriz de Estructuras (AoS).

En el ejemplo, estoy considerando tres versiones diferentes del mismo código:

SoA (v1)
Matrices rectas (v2)
AoS (v3)

En particular, la versión 2 considera el uso de arrays rectos. Los tiempos de las versiones 2 y 3 son los mismos para este ejemplo y el resultado es mejor que la versión 1. Sospecho que, en general, los arrays rectos podrían ser preferibles, aunque a expensas de la legibilidad, ya que, por ejemplo, la carga desde uniform cache podría habilitarse a través de const __restrict__ para este caso.

#include "cuda_runtime.h"
#include "device_launch_parameters.h"

#include <stdio.h>

#include <thrust\device_vector.h>

#include "Utilities.cuh"
#include "TimingGPU.cuh"

#define BLOCKSIZE   1024

/******************************************/
/* CELL STRUCT LEADING TO ARRAY OF STRUCT */
/******************************************/
struct cellAoS {

    unsigned int    x1;
    unsigned int    x2;
    unsigned int    code;
    bool            done;

};

/*******************************************/
/* CELL STRUCT LEADING TO STRUCT OF ARRAYS */
/*******************************************/
struct cellSoA {

    unsigned int    *x1;
    unsigned int    *x2;
    unsigned int    *code;
    bool            *done;

};


/*******************************************/
/* KERNEL MANIPULATING THE ARRAY OF STRUCT */
/*******************************************/
__global__ void AoSvsSoA_v1(cellAoS *d_cells, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {
        cellAoS tempCell = d_cells[tid];

        tempCell.x1 = tempCell.x1 + 10;
        tempCell.x2 = tempCell.x2 + 10;

        d_cells[tid] = tempCell;
    }

}

/******************************/
/* KERNEL MANIPULATING ARRAYS */
/******************************/
__global__ void AoSvsSoA_v2(unsigned int * __restrict__ d_x1, unsigned int * __restrict__ d_x2, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        d_x1[tid] = d_x1[tid] + 10;
        d_x2[tid] = d_x2[tid] + 10;

    }

}

/********************************************/
/* KERNEL MANIPULATING THE STRUCT OF ARRAYS */
/********************************************/
__global__ void AoSvsSoA_v3(cellSoA cell, const int N) {

    const int tid = threadIdx.x + blockIdx.x * blockDim.x;

    if (tid < N) {

        cell.x1[tid] = cell.x1[tid] + 10;
        cell.x2[tid] = cell.x2[tid] + 10;

    }

}

/********/
/* MAIN */
/********/
int main() {

    const int N = 2048 * 2048 * 4;

    TimingGPU timerGPU;

    thrust::host_vector<cellAoS>    h_cells(N);
    thrust::device_vector<cellAoS>  d_cells(N);

    thrust::host_vector<unsigned int>   h_x1(N);
    thrust::host_vector<unsigned int>   h_x2(N);

    thrust::device_vector<unsigned int> d_x1(N);
    thrust::device_vector<unsigned int> d_x2(N);

    for (int k = 0; k < N; k++) {

        h_cells[k].x1 = k + 1;
        h_cells[k].x2 = k + 2;
        h_cells[k].code = k + 3;
        h_cells[k].done = true;

        h_x1[k] = k + 1;
        h_x2[k] = k + 2;

    }

    d_cells = h_cells;

    d_x1 = h_x1;
    d_x2 = h_x2;

    cellSoA cell;
    cell.x1 = thrust::raw_pointer_cast(d_x1.data());
    cell.x2 = thrust::raw_pointer_cast(d_x2.data());
    cell.code = NULL;
    cell.done = NULL;

    timerGPU.StartCounter();
    AoSvsSoA_v1 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(thrust::raw_pointer_cast(d_cells.data()), N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing AoSvsSoA_v1 = %f\n", timerGPU.GetCounter());

    //timerGPU.StartCounter();
    //AoSvsSoA_v2 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(thrust::raw_pointer_cast(d_x1.data()), thrust::raw_pointer_cast(d_x2.data()), N);
    //gpuErrchk(cudaPeekAtLastError());
    //gpuErrchk(cudaDeviceSynchronize());
    //printf("Timing AoSvsSoA_v2 = %f\n", timerGPU.GetCounter());

    timerGPU.StartCounter();
    AoSvsSoA_v3 << <iDivUp(N, BLOCKSIZE), BLOCKSIZE >> >(cell, N);
    gpuErrchk(cudaPeekAtLastError());
    gpuErrchk(cudaDeviceSynchronize());
    printf("Timing AoSvsSoA_v3 = %f\n", timerGPU.GetCounter());

    h_cells = d_cells;

    h_x1 = d_x1;
    h_x2 = d_x2;

    // --- Check results
    for (int k = 0; k < N; k++) {
        if (h_x1[k] != k + 11) {
            printf("h_x1[%i] not equal to %i\n", h_x1[k], k + 11);
            break;
        }
        if (h_x2[k] != k + 12) {
            printf("h_x2[%i] not equal to %i\n", h_x2[k], k + 12);
            break;
        }
        if (h_cells[k].x1 != k + 11) {
            printf("h_cells[%i].x1 not equal to %i\n", h_cells[k].x1, k + 11);
            break;
        }
        if (h_cells[k].x2 != k + 12) {
            printf("h_cells[%i].x2 not equal to %i\n", h_cells[k].x2, k + 12);
            break;
        }
    }

}

Los siguientes son los tiempos (corridas realizadas en un GTX960):

Array of struct        9.1ms (v1 kernel)
Struct of arrays       3.3ms (v3 kernel)
Straight arrays        3.2ms (v2 kernel)

3

Author: JackOLantern,
Warning: date(): Invalid date.timezone value 'Europe/Kyiv', we selected the timezone 'UTC' for now. in /var/www/agent_stack/data/www/ajaxhispano.com/template/agent.layouts/content.php on line 61
2017-10-11 16:59:11

SoA es efectivamente bueno para el procesamiento SIMD. Por varias razones, pero básicamente es más eficiente cargar 4 flotadores consecutivos en un registro. Con algo como:

 float v [4] = {0};
 __m128 reg = _mm_load_ps( v );

Que usando:

 struct vec { float x; float, y; ....} ;
 vec v = {0, 0, 0, 0};

Y crear un __m128 datos accediendo a todos los miembros:

 __m128 reg = _mm_set_ps(v.x, ....);

Si sus matrices están alineadas con 16 bytes, la carga/almacenamiento de datos son más rápidos y algunos op se pueden realizar directamente en memoria.

1

Author: alexbuisson,
Warning: date(): Invalid date.timezone value 'Europe/Kyiv', we selected the timezone 'UTC' for now. in /var/www/agent_stack/data/www/ajaxhispano.com/template/agent.layouts/content.php on line 61
2013-07-29 14:04:54

score 48 · Accepted Answer

La elección de AoS versus SoA para un rendimiento óptimo generalmente depende del patrón de acceso. Sin embargo, esto no se limita solo a CUDA: consideraciones similares se aplican para cualquier arquitectura donde el rendimiento puede verse afectado significativamente por el patrón de acceso a la memoria, por ejemplo, donde tiene cachés o donde el rendimiento es mejor con acceso a memoria contigua (por ejemplo, accesos a memoria fusionados en CUDA).

Por ejemplo, para píxeles RGB frente a planos RGB separados:

struct {
    uint8_t r, g, b;
} AoS[N];

struct {
    uint8_t r[N];
    uint8_t g[N];
    uint8_t b[N];
} SoA;

Si va a estar accediendo los componentes R/G / B de cada píxel simultáneamente entonces AoS generalmente tiene sentido, ya que las lecturas sucesivas de los componentes R, G, B serán contiguas y generalmente contenidas dentro de la misma línea de caché. Para CUDA esto también significa coalescencia de lectura/escritura de memoria.

Sin embargo, si va a procesar planos de color por separado, entonces puede preferirse SoA, por ejemplo, si desea escalar todos los valores de R por algún factor de escala, entonces SoA significa que todos los componentes de R serán contiguos.

Uno más la consideración es relleno / alineación. Para el ejemplo RGB anterior, cada elemento en un diseño AoS está alineado a un múltiplo de 3 bytes, lo que puede no ser conveniente para CUDA, SIMD, et al - en algunos casos, tal vez incluso requiera relleno dentro de la estructura para hacer la alineación más conveniente (por ejemplo, agregar un elemento uint8_t ficticio para garantizar la alineación de 4 bytes). En el caso SoA, sin embargo, los planos están alineados en bytes, lo que puede ser más conveniente para ciertos algoritmos/arquitecturas.

Para la mayoría del procesamiento de imágenes el escenario AoS es mucho más común, pero para otras aplicaciones, o para tareas específicas de procesamiento de imágenes, esto puede no ser siempre el caso. Cuando no hay una opción obvia, recomendaría AoS como la opción predeterminada.

Ver también esta respuesta para una discusión más general de AoS v SoA.