我有 2 个未排序的数组和这些数组的 2 个副本。我正在使用两个不同的线程对两个数组进行排序，然后我将另外两个未排序的数组一一排序。本来以为线程处理会更快，其实不然，线程怎么会花更多的时间呢？

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

void insertionSort(unsigned int arr[], int n)
{
   int i, key, j;
   for (i = 1; i < n; i++)
   {
       key = arr[i];
       j = i-1;

       while (j >= 0 && arr[j] > key)
       {
           arr[j+1] = arr[j];
           j = j-1;
       }
       arr[j+1] = key;
   }
}

void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *) threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count, i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    //get the loop count. If loop count is not provided take 10000 as default loop count.
    if(argc == 2){
        count = atoi(argv[1]);
    }
    else{
        count = 10000;
    }

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];    

    srand(time(0));

    for(i = 0; i<count; i++){
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    start = clock();
    for(int t=0; t<2; t++) {
        thread_data_array[t].count = count;
        if(t==0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;    

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *) &thread_data_array[t]);
        if (rc) {
                printf("ERROR; return code from pthread_create() is %d\n", rc);
                exit(-1);
            }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort using threads: %d\n", total_t);


    start = clock();
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    end = clock();

    total_t = (double)(end - start);
    printf("Total time taken by CPU to sort sequentially: %d\n", total_t);

    pthread_exit(NULL);
}

我正在使用 Linux 服务器来执行代码。首先，我随机填充数组并将它们复制到两个单独的数组中。对于前两个数组，我使用 pthread 创建两个线程并将两个数组传递给它们，它使用插入排序对它们进行排序。对于其他两个数组，我只是一个一个地排序。

我希望通过使用线程可以减少执行时间，但实际上需要更多时间。

最佳答案

诊断

您获得几乎相同时间的原因——线程代码的时间比顺序代码的时间略多——是 clock()测量 CPU 时间，并且两种排序方式占用的 CPU 时间几乎相同，因为它们执行相同的工作(并且由于设置和拆除线程的时间，线程数可能略大)。

The clock() function shall return the implementation's best approximation to the processor time used by the process since the beginning of an implementation-defined era related only to the process invocation.

BSD (macOS) 手册页:

The clock() function determines the amount of processor time used since the invocation of the calling process, measured in CLOCKS_PER_SECs of a second.

对两个数组进行排序所花费的CPU时间量基本相同；区别在于线程的开销(或多或少)。

修改后的代码

我有一组函数可以使用 clock_gettime()相反(代码在 timer.c 和 timer.h 中 GitHub )。这些测量挂钟时间——耗时，而不是 CPU 时间。

这是您的代码的一个轻微调整版本 — 实质性的变化是将排序函数中 key 的类型从 int 更改为 unsigned int 匹配数组中的数据，并将%d的转换规范固定为%ld，以匹配GCC识别的类型为clock_t。我稍微调整了参数处理和计时消息，使它们的长度保持一致，并添加了耗时测量代码:

#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <time.h>
#include <pthread.h>
#include "timer.h"

struct thread_data
{
    int count;
    unsigned int *arr;
};

struct thread_data thread_data_array[2];

static
void insertionSort(unsigned int arr[], int n)
{
    for (int i = 1; i < n; i++)
    {
        unsigned int key = arr[i];
        int j = i - 1;

        while (j >= 0 && arr[j] > key)
        {
            arr[j + 1] = arr[j];
            j = j - 1;
        }
        arr[j + 1] = key;
    }
}

static
void *sortAndMergeArrays(void *threadarg)
{
    int count;
    unsigned int *arr;
    struct thread_data *my_data;

    my_data = (struct thread_data *)threadarg;
    count = my_data->count;
    arr =  my_data->arr;

    insertionSort(arr, count);

    pthread_exit(NULL);
}

int main(int argc, char *argv[])
{
    int count = 10000;
    int i, rc;
    clock_t start, end, total_t;
    pthread_t threads[2];

    // get the loop count. If loop count is not provided take 10000 as default loop count.
    if (argc == 2)
        count = atoi(argv[1]);

    unsigned int arr1[count], arr2[count], copyArr1[count], copyArr2[count];

    srand(time(0));

    for (i = 0; i < count; i++)
    {
        arr1[i] = rand();
        arr2[i] = rand();

        copyArr1[i] = arr1[i];
        copyArr2[i] = arr2[i];
    }

    Clock clk;
    clk_init(&clk);

    start = clock();
    clk_start(&clk);
    for (int t = 0; t < 2; t++)
    {
        thread_data_array[t].count = count;
        if (t == 0)
            thread_data_array[t].arr = arr1;
        else
            thread_data_array[t].arr = arr2;

        rc = pthread_create(&threads[t], NULL, sortAndMergeArrays, (void *)&thread_data_array[t]);
        if (rc)
        {
            printf("ERROR; return code from pthread_create() is %d\n", rc);
            exit(-1);
        }
    }

    pthread_join(threads[0], NULL);
    pthread_join(threads[1], NULL);
    clk_stop(&clk);
    end = clock();

    char buffer[32];
    printf("Elapsed using threads:  %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));

    total_t = (double)(end - start);
    printf("CPU time using threads: %ld\n", total_t);

    start = clock();
    clk_start(&clk);
    insertionSort(copyArr1, count);
    insertionSort(copyArr2, count);
    clk_stop(&clk);
    end = clock();

    printf("Elapsed sequentially:   %s s\n", clk_elapsed_us(&clk, buffer, sizeof(buffer)));
    total_t = (double)(end - start);
    printf("CPU time sequentially:  %ld\n", total_t);

    return 0;
}

结果

运行示例(程序 inssortthread23)——在配备 16 GiB RAM 和 2.7 GHz Intel Core i7 CPU 的 MacBook Pro(15"2016)上运行，运行 macOS High Sierra 10.13，使用 GCC 7.2.0用于编译。我有常规的后台程序在运行——例如浏览器未被积极使用，没有音乐或视频播放，没有正在进行的下载等。(这些事情对基准测试很重要。)

$ inssortthread23 100000
Elapsed using threads:  1.060299 s
CPU time using threads: 2099441
Elapsed sequentially:   2.146059 s
CPU time sequentially:  2138465
$ inssortthread23 200000
Elapsed using threads:  4.332935 s
CPU time using threads: 8616953
Elapsed sequentially:   8.496348 s
CPU time sequentially:  8469327
$ inssortthread23 300000
Elapsed using threads:  9.984021 s
CPU time using threads: 19880539
Elapsed sequentially:   20.000900 s
CPU time sequentially:  19959341
$

结论

在这里，你可以清楚地看到:

非线程代码的运行时间大约是线程代码的两倍。
线程和非线程代码的 CPU 时间几乎相同。
总时间是排序行数的二次方。

所有这些都非常符合预期 — 一旦您意识到 clock() 正在测量 CPU 时间，而不是运行时间。

小谜题

您还可以看到，有时我得到的线程 CPU 时间略小于顺序排序的 CPU 时间。我对此没有任何解释——我认为它“迷失在噪音中”，尽管效果是持久的:

$ inssortthread23 100000
Elapsed using threads:  1.051229 s
CPU time using threads: 2081847
Elapsed sequentially:   2.138538 s
CPU time sequentially:  2132083
$ inssortthread23 100000
Elapsed using threads:  1.053656 s
CPU time using threads: 2089886
Elapsed sequentially:   2.128908 s
CPU time sequentially:  2122983
$ inssortthread23 100000
Elapsed using threads:  1.058283 s
CPU time using threads: 2093644
Elapsed sequentially:   2.126402 s
CPU time sequentially:  2120625
$

$ inssortthread23 200000
Elapsed using threads:  4.259660 s
CPU time using threads: 8479978
Elapsed sequentially:   8.872929 s
CPU time sequentially:  8843207
$ inssortthread23 200000
Elapsed using threads:  4.463954 s
CPU time using threads: 8883267
Elapsed sequentially:   8.603401 s
CPU time sequentially:  8580240
$ inssortthread23 200000
Elapsed using threads:  4.227154 s
CPU time using threads: 8411582
Elapsed sequentially:   8.816412 s
CPU time sequentially:  8797965
$

关于c - 使用 2 个线程对 2 个数组进行排序比将 2 个数组一一排序需要更多时间，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/46599290/

c - 使用 2 个线程对 2 个数组进行排序比将 2 个数组一一排序需要更多时间

诊断

修改后的代码

结果

结论

小谜题

上一篇：mysql - 将带有子线的行转换为单行

下一篇：Linux 剪切、粘贴