c - 重复文件查找算法的建议(使用 C)

标签 c file mapping posix pthreads

我想编写一个程序来测试两个文件是否重复(内容完全相同)。首先,我测试文件是否具有相同的大小,如果有,我开始比较它们的内容。

我的第一个想法是将文件“拆分”为固定大小的 block ,然后为每个 block 启动一个线程,fseek 以启动每个 block 的字符并继续并行比较。当一个线程的比较失败时,其他工作线程被取消,程序退出线程生成循环。

代码如下所示:
dupf.h

#ifndef __NM__DUPF__H__
#define __NM__DUPF__H__
#define NUM_THREADS 15
#define BLOCK_SIZE 8192

/* Thread argument structure */
struct thread_arg_s {
    const char *name_f1;        /* First file name */
    const char *name_f2;        /* Second file name */
    int cursor;                 /* Where to seek in the file */
};
typedef struct thread_arg_s thread_arg;

/**
 * 'arg' is of type thread_arg.
 * Checks if the specified file blocks are 
 * duplicates.
 */
void *check_block_dup(void *arg);

/**
 * Checks if two files are duplicates
 */
int check_dup(const char *name_f1, const char *name_f2);

/**
* Returns a valid pointer to a file.
* If the file (given by the path/name 'fname') cannot be opened
* in 'mode', the program is interrupted an error message is shown.
**/
FILE *safe_fopen(const char *name, const char *mode);

#endif

dupf.c
#include <errno.h>
#include <pthread.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include "dupf.h"

FILE *safe_fopen(const char *fname, const char *mode)
{
    FILE *f = NULL;
    f = fopen(fname, mode);
    if (f == NULL) {
        char emsg[255];
        sprintf(emsg, "FOPEN() %s\t", fname);
        perror(emsg);
        exit(-1);
    }
    return (f);
}

void *check_block_dup(void *arg)
{
    const char *name_f1 = NULL, *name_f2 = NULL;    /* File names */
    FILE *f1 = NULL, *f2 = NULL;                    /* Streams */
    int cursor = 0;                                 /* Reading cursor */
    char buff_f1[BLOCK_SIZE], buff_f2[BLOCK_SIZE];  /* Character buffers */
    int rchars_1, rchars_2;                         /* Readed characters */
    /* Initializing variables from 'arg' */
    name_f1 = ((thread_arg*)arg)->name_f1;
    name_f2 = ((thread_arg*)arg)->name_f2;
    cursor = ((thread_arg*)arg)->cursor;
    /* Opening files */
    f1 = safe_fopen(name_f1, "r");
    f2 = safe_fopen(name_f2, "r");
    /* Setup cursor in files */
    fseek(f1, cursor, SEEK_SET);
    fseek(f2, cursor, SEEK_SET);
    /* Initialize buffers */
    rchars_1 = fread(buff_f1, 1, BLOCK_SIZE, f1);
    rchars_2 = fread(buff_f2, 1, BLOCK_SIZE, f2);
    if (rchars_1 != rchars_2) {
        /* fread failed to read the same portion.
         * program cannot continue */
        perror("ERROR WHEN READING BLOCK");
        exit(-1);
    }
    while (rchars_1-->0) {
        if (buff_f1[rchars_1] != buff_f2[rchars_1]) {
            /* Different characters */
            fclose(f1);
            fclose(f2);
            pthread_exit("notdup");
        }
    }
    /* Close streams */
    fclose(f1);
    fclose(f2);
    pthread_exit("dup");
}

int check_dup(const char *name_f1, const char *name_f2)
{
    int num_blocks = 0;             /* Number of 'blocks' to check */
    int num_tsp = 0;                /* Number of threads spawns */
    int tsp_iter = 0;               /* Iterator for threads spawns */
    pthread_t *tsp_threads = NULL;
    thread_arg *tsp_threads_args = NULL;
    int tsp_threads_iter = 0;
    int thread_c_res = 0;           /* Thread creation result */
    int thread_j_res = 0;           /* Thread join res */
    int loop_res = 0;               /* Function result */
    int cursor;
    struct stat buf_f1;
    struct stat buf_f2;

    if (name_f1 == NULL || name_f2 == NULL) {
        /* Invalid input parameters */
        perror("INVALID FNAMES\t");
        return (-1);
    }

    if (stat(name_f1, &buf_f1) != 0 || stat(name_f2, &buf_f2) != 0) {
        /* Stat fails */
        char emsg[255];
        sprintf(emsg, "STAT() ERROR: %s %s\t", name_f1, name_f2);
        perror(emsg);
        return (-1);
    }

    if (buf_f1.st_size != buf_f2.st_size) {
        /* File have different sizes */
        return (1);
    }

    /* Files have the same size, function exec. is continued */
    num_blocks = (buf_f1.st_size / BLOCK_SIZE) + 1;
    num_tsp = (num_blocks / NUM_THREADS) + 1;
    cursor = 0;
    for (tsp_iter = 0; tsp_iter < num_tsp; tsp_iter++) {
        loop_res = 0;
        /* Create threads array for this spawn */
        tsp_threads = malloc(NUM_THREADS * sizeof(*tsp_threads));
        if (tsp_threads == NULL) {
            perror("TSP_THREADS ALLOC FAILURE\t");
            return (-1);
        }
        /* Create arguments for every thread in the current spawn */
        tsp_threads_args = malloc(NUM_THREADS * sizeof(*tsp_threads_args));
        if (tsp_threads_args == NULL) {
            perror("TSP THREADS ARGS ALLOCA FAILURE\t");
            return (-1);
        }
        /* Initialize arguments and create threads */
        for (tsp_threads_iter = 0; tsp_threads_iter < NUM_THREADS;
                tsp_threads_iter++) {
            if (cursor >= buf_f1.st_size) {
                break;
            }
            tsp_threads_args[tsp_threads_iter].name_f1 = name_f1;
            tsp_threads_args[tsp_threads_iter].name_f2 = name_f2;
            tsp_threads_args[tsp_threads_iter].cursor = cursor;
            thread_c_res = pthread_create(
                               &tsp_threads[tsp_threads_iter],
                               NULL,
                               check_block_dup,
                               (void*)&tsp_threads_args[tsp_threads_iter]);
            if (thread_c_res != 0) {
                perror("THREAD CREATION FAILURE");
                return (-1);
            }
            cursor+=BLOCK_SIZE;
        }
        /* Join last threads and get their status */
        while (tsp_threads_iter-->0) {
            void *thread_res = NULL;
            thread_j_res = pthread_join(tsp_threads[tsp_threads_iter],
                                        &thread_res);
            if (thread_j_res != 0) {
                perror("THREAD JOIN FAILURE");
                return (-1);
            }
            if (strcmp((char*)thread_res, "notdup")==0) {
                loop_res++;
                /* Closing other threads and exiting by condition
                 * from loop. */
                while (tsp_threads_iter-->0) {
                    pthread_cancel(tsp_threads[tsp_threads_iter]);
                }
            }
        }
        free(tsp_threads);
        free(tsp_threads_args);
        if (loop_res > 0) {
            break;
        }
    }
    return (loop_res > 0) ? 1 : 0;
}

该功能工作正常(至少对于我测试过的)。尽管如此,来自#C (freenode) 的一些人表示该解决方案过于复杂,并且由于 hddisk 上的并行读取,它可能表现不佳。

我想知道的:
  • 默认情况下,线程方法是否存在缺陷?
  • fseek() 这么慢吗?
  • 有没有办法以某种方式将文件映射到内存然后比较它们?

  • 后期编辑:

    今天我有一些时间,我听从了你的建议。你是对的,这个线程版本实际上比单线程版本性能更差,这都是因为硬盘上的并行读数。

    另一件事是我编写了一个使用 mmap() 的函数,并且到目前为止是最佳函数。该功能的最大缺点仍然是当文​​件变得非常大时它会失败。

    这是新的实现(一个非常粗暴和直接的代码):
    #include <errno.h>
    #include <fcntl.h>
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <string.h>
    #include <sys/mman.h>
    #include <sys/types.h>
    #include <sys/stat.h>
    #include <unistd.h>
    #include "dupf.h"
    
    /**
    * Safely assures that a file is opened. 
    * If cannot open file, the flow of the program is interrupted.
    * The error code returned is -1.
    **/
    FILE *safe_fopen(const char *fname, const char *mode)
    {
        FILE *f = NULL;
        f = fopen(fname, mode);
        if (f == NULL) {
            char emsg[1024];
            sprintf(emsg, "Cannot open file: %s\t", fname);
            perror(emsg);
            exit(-1);
        }
        return (f);
    }
    
    /**
    * Check if two files have the same size.
    * Returns:
    * -1    Error.
    * 0 If they have the same size.
    * 1 If the don't have the same size.
    **/
    int check_same_size(const char *f1_name, const char *f2_name, off_t *f1_size, off_t *f2_size)
    {
        struct stat f1_stat, f2_stat;
        if((f1_name == NULL) || (f2_name == NULL)){
            fprintf(stderr, "Invalid filename passed to function [check_same_size].\n");
            return (-1);
        }
        if((stat(f1_name, &f1_stat) != 0) || (stat(f2_name, &f2_stat) !=0)){
            fprintf(stderr, "Cannot apply stat. [check_same_size].\n");
            return (-1);
        }
        if(f1_size != NULL){
            *f1_size = f1_stat.st_size;
        }
        if(f2_size != NULL){
            *f2_size = f2_stat.st_size;
        }
        return (f1_stat.st_size == f2_stat.st_size) ? 0 : 1;
    }
    
    /**
    * Test if two files are duplicates.
    * Returns:
    * -1    Error.
    * 0 If they are duplicates.
    * 1 If they are not duplicates.
    **/
    int check_dup_plain(char *f1_name, char *f2_name, int block_size)
    {
        if ((f1_name == NULL) || (f2_name == NULL)){
            fprintf(stderr, "Invalid filename passed to function [check_dup_plain].\n");
            return (-1);
        }
        FILE *f1 = NULL, *f2 = NULL;
        char f1_buff[block_size], f2_buff[block_size];
        size_t rch1, rch2;
        if(check_same_size(f1_name, f2_name, NULL, NULL) == 1){
            return (1);
        }
        f1 = safe_fopen(f1_name, "r");
        f2 = safe_fopen(f2_name, "r");
        while(!feof(f1) && !feof(f2)){
            rch1 = fread(f1_buff, 1, block_size, f1);
            rch2 = fread(f2_buff, 1, block_size, f2);
            if(rch1 != rch2){
                fprintf(stderr, "Invalid reading from file. Cannot continue. [check_dup_plain].\n");
                return (-1);
            }
            while(rch1-->0){
                if(f1_buff[rch1] != f2_buff[rch1]){
                    return (1);
                }
            }
        }
        fclose(f1);
        fclose(f2);
        return (0);
    }
    
    /**
    * Test if two files are duplicates.
    * Returns:
    * -1    Error.
    * 0 If they are duplicates.
    * 1 If they are not duplicates.
    **/
    int check_dup_memmap(char *f1_name, char *f2_name)
    {
        struct stat f1_stat, f2_stat;
        char *f1_array = NULL, *f2_array = NULL;
        off_t f1_size, f2_size;
        int f1_des, f2_des, cont, res;
        if((f1_name == NULL) || (f2_name == NULL)){
            fprintf(stderr, "Invalid filename passed to function [check_dup_memmap].\n");
            return (-1);    
        }
        if(check_same_size(f1_name, f2_name, &f1_size, &f2_size) == 1){
            return (1);
        }
        f1_des = open(f1_name, O_RDONLY);
        f2_des = open(f2_name, O_RDONLY);
        if((f1_des == -1) || (f2_des == -1)){
            perror("Cannot open file");
            exit(-1);       
        }
        f1_array = mmap(0, f1_size * sizeof(*f1_array), PROT_READ, MAP_SHARED, f1_des, 0);
        if(f1_array == NULL){
            fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
            return (-1);
        }
        f2_array = mmap(0, f2_size * sizeof(*f2_array), PROT_READ, MAP_SHARED, f2_des, 0);
        if(f2_array == NULL){
            fprintf(stderr, "Cannot map file to memory [check_dup_memmap].\n");
            return (-1);
        }
        cont = f1_size;
        res = 0;
        while(cont-->0){
            if(f1_array[cont]!=f2_array[cont]){
                res = 1;
                break;
            }
        }
        munmap((void*) f1_array, f1_size * sizeof(*f1_array));
        munmap((void*) f2_array, f2_size * sizeof(*f2_array));
        return res;
    }
    
    int main(int argc, char *argv[])
    {
        printf("result: %d\n",check_dup_memmap("f2","f1"));
        return (0);
    }
    

    我现在计划通过重新添加线程功能来扩展此代码,但这次读取将在内存中。

    感谢您的回答。

    最佳答案

    限制因素是磁盘读取,它(假设两个文件都在同一个磁盘上)无论如何都会被序列化,所以我认为线程不会有太大帮助。

    关于c - 重复文件查找算法的建议(使用 C),我们在Stack Overflow上找到一个类似的问题: https://stackoverflow.com/questions/2662946/

    相关文章:

    c - 在 C 中将结构传递给回调时出现 SIGSEGV(段错误)

    c - 使用 fwrite 存储在文本文件中的数据在 C 中不可读

    php - 使用 PHP 将文件从一台服务器移动到另一台服务器的最佳方法是什么?

    javascript - .txt文件的脚本阅读栏

    c# - 如何使用 AutoMapper 为特定映射映射空值?

    c++ - C程序员开始写C++有哪些坏习惯?

    c - 将指针传递给具有 volatile 成员的结构作为函数参数

    ios - 快速对象映射

    Tomcat URL 将单个 servlet 映射到根级别

    c - GetSaveFileName()如何更新 "File name:"控件中的文件扩展名?