java - Nightly build - need to process one at a time instead of as a list

Tags: java sql-server spring-boot amazon-s3

There is a nightly update process that retrieves documents from an Amazon S3 bucket, compares them against a list of Contracts, and saves the byte data. I would like it to fetch and process one file at a time rather than working through a list. The problem is memory: on nights with many documents, the process errors out because the list grows too large.


import com.amazonaws.regions.Regions;
import com.amazonaws.services.s3.AmazonS3;
import com.amazonaws.services.s3.AmazonS3ClientBuilder;
import com.amazonaws.services.s3.model.*;
import com.amazonaws.util.IOUtils;
import com.duke.contractsystembackend.models.ContractDocument;
import com.duke.contractsystembackend.models.S3ContractDocument;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;

public class S3Reader {

    // This class is constructed with `new` in the caller, so it is not a
    // Spring-managed bean; an @Autowired field here would never be injected.
    private String bucketName;
    private Regions region = Regions.US_EAST_1;

    private final AmazonS3 s3;
    private static final Logger log = LoggerFactory.getLogger(S3Reader.class);

    /**
     * @param bucketName the S3 bucket to read from
     */
    public S3Reader(String bucketName) {
        this.bucketName = bucketName;
        this.s3 = AmazonS3ClientBuilder.standard().withRegion(this.region).build();
    }

    /**
     * @param bucketName the S3 bucket to read from
     * @param s3         a preconfigured S3 client
     */
    public S3Reader(String bucketName, AmazonS3 s3) {
        this.bucketName = bucketName;
        this.s3 = s3;
    }

    /**
     * Matches each contract document against the given folder prefixes and
     * collects the results. Note: this attaches every document's bytes before
     * returning, which is the source of the memory problem described above.
     *
     * @param buckets              folder prefixes to search under
     * @param contractDocumentList contracts whose file data is missing
     * @param s3                   the S3 client to use
     * @return the matched documents with their byte data attached
     */
    public List<S3ContractDocument> traverseBucket(String[] buckets, List<ContractDocument> contractDocumentList, AmazonS3 s3) {

        List<S3ContractDocument> list = new ArrayList<>();

        contractDocumentList.forEach(cd -> {

            for (String bucket : buckets) {

                String key = String.format("%s/%s", bucket, cd.getFileName());
                log.info("- Looking for prefix: " + key);

                ListObjectsV2Request req = new ListObjectsV2Request()
                        .withBucketName(bucketName)
                        .withPrefix(key)
                        .withMaxKeys(buckets.length);
                ListObjectsV2Result result = s3.listObjectsV2(req);
                List<S3ObjectSummary> objects = result.getObjectSummaries();

                for (S3ObjectSummary os : objects) {
                    log.info("* " + os.getKey());

                    // try-with-resources so the S3 connection is released even on failure;
                    // fetch by the actual key found, not the prefix
                    try (S3Object object = s3.getObject(new GetObjectRequest(bucketName, os.getKey()))) {
                        S3ContractDocument di = new S3ContractDocument();
                        cd.setFileData(getObjectContentAsByte(object));
                        di.setKey(os.getKey());
                        di.setBucketName(bucketName);
                        di.setContractDocument(cd);
                        list.add(di);
                    } catch (Exception e) {
                        log.error("Error trying to get bytes from s3's inputStream: " + e.getMessage(), e);
                    }
                }
            }
        });
        return list;
    }

    /**
     * Reads the full object content into a byte array. For large files this
     * holds the entire file in memory.
     *
     * @param s3Object the S3 object whose stream will be drained
     * @return the object content as bytes
     * @throws IOException if the stream cannot be read
     */
    protected byte[] getObjectContentAsByte(S3Object s3Object) throws IOException {
        return IOUtils.toByteArray(s3Object.getObjectContent());
    }
}
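An aside on where the memory pressure comes from: traverseBucket attaches every document's bytes before the list is ever returned, so the whole night's worth of file data is held at once. One way to avoid that is a callback-based variant. The sketch below is illustrative only (processOneByOne and its Consumer handler are not in the original code) and assumes the fields and helper of the class above remain in scope:

    // Illustrative only - a hypothetical streaming variant, not part of the
    // original code. Instead of returning a list, each matched document is
    // handed to a callback so its bytes can be processed and released before
    // the next object is fetched.
    public void processOneByOne(String[] buckets, List<ContractDocument> contractDocumentList,
                                AmazonS3 s3, java.util.function.Consumer<S3ContractDocument> handler) {
        for (ContractDocument cd : contractDocumentList) {
            for (String bucket : buckets) {
                String key = String.format("%s/%s", bucket, cd.getFileName());
                ListObjectsV2Result result = s3.listObjectsV2(new ListObjectsV2Request()
                        .withBucketName(bucketName)
                        .withPrefix(key));
                for (S3ObjectSummary os : result.getObjectSummaries()) {
                    try (S3Object object = s3.getObject(new GetObjectRequest(bucketName, os.getKey()))) {
                        S3ContractDocument di = new S3ContractDocument();
                        cd.setFileData(getObjectContentAsByte(object));
                        di.setKey(os.getKey());
                        di.setBucketName(bucketName);
                        di.setContractDocument(cd);
                        handler.accept(di);   // process immediately...
                        cd.setFileData(null); // ...then drop the bytes before the next fetch
                    } catch (Exception e) {
                        log.error("Error reading " + os.getKey(), e);
                    }
                }
            }
        }
    }

The scheduled job that calls the reader follows: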

import com.amazonaws.services.s3.AmazonS3;
import com.duke.contractsystembackend.exceptions.FileStorageException;
import com.duke.contractsystembackend.models.ContractDocument;
import com.duke.contractsystembackend.models.S3ContractDocument;
import com.duke.contractsystembackend.services.*;
import lombok.extern.slf4j.Slf4j;
import org.springframework.beans.factory.annotation.Autowired;
import org.springframework.beans.factory.annotation.Value;
import org.springframework.scheduling.annotation.Scheduled;
import org.springframework.stereotype.Component;
import org.springframework.transaction.annotation.Transactional;

import java.util.List;

@Slf4j
@Component
public class NightlyMaximoUpdate {

    @Autowired
    S3Service s3Service;

    @Autowired
    ContractDocumentStorageService contractDocumentStorageService;

    @Autowired
    OCRSubmissionService ocrSubmissionService;

    @Autowired
    ContractService contractService;

    @Value("${S3_BUCKET_NAME}")
    protected String BUCKET_NAME;

    @Value("${SEND_BATCH:TRUE}")
    protected String shouldSend;


    @Transactional
    @Scheduled(cron = "${batch.job.cron}")
    public void nightlyMaximoTransfer() {
        // code to tell what environment you're in - let's set this to trigger a cdc job at 11 and an adc job at 3 (or something along those lines).
        if (System.getenv("application_env_json").contains("cdc.duke-energy.com")) {
            // TODO: environment-specific scheduling not implemented yet
        }

        // constant first so a missing property can't cause a NullPointerException
        if ("TRUE".equals(this.shouldSend)) {
            AmazonS3 s3Client = s3Service.getS3Client();

            List<ContractDocument> contractDocumentList = contractService.getAllByFileDataIsNull();
            log.info("returned {} elements", contractDocumentList.size());
            contractDocumentList.forEach(doc -> log.info("{}", doc.getId()));
            // Defines the folder prefixes to read from
            String[] folders = new String[]{
                    "sc_contracts",
                    "attachments",
                    "diagrams",
                    "sc_purchasing",
            };

            // Initialize S3 reader, reusing the client obtained above
            S3Reader reader = new S3Reader(BUCKET_NAME, s3Client);
            List<S3ContractDocument> s3ContractDocuments = reader.traverseBucket(folders, contractDocumentList, s3Client);

            System.out.println("Found " + s3ContractDocuments.size() + " elements");

            // Diffs existing between SSMS and S3
            s3Service.diffList(contractDocumentList, s3ContractDocuments);

            for (S3ContractDocument s3cd : s3ContractDocuments) {

                ContractDocument newDocument;

                try {
                    newDocument = contractDocumentStorageService.storeS3FIle(s3cd.getContractDocument().getFileData(),
                            s3cd.getContractDocument().getId());
                } catch (FileStorageException e) {
                    log.error("Failed to store contract document " + s3cd.getContractDocument().getId(), e);
                    continue; // without this, newDocument would be null below
                }
                boolean success;
                if (newDocument.getFileType().equals("application/pdf")) {
                    success = this.ocrSubmissionService.SubmitContractForOCR(s3cd.getContractDocument().getFileData(), newDocument.getId());
                } else {
                    success = true; // only PDFs are sent for OCR
                }
                if (success) {
                    log.info("OCR Successful " + newDocument.getId());
                } else {
                    log.info("OCR not successful " + newDocument.getId());
                }
            }

        }

    }


}
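If the nightly job used the hypothetical processOneByOne variant sketched earlier, the storage-and-OCR loop could move into the callback so only one document's bytes are live at a time. Roughly (illustrative wiring only; the diffList step, which needs both full lists, would have to be reworked in this model):

            // Illustrative only - assumes the processOneByOne sketch above
            S3Reader reader = new S3Reader(BUCKET_NAME, s3Client);
            reader.processOneByOne(folders, contractDocumentList, s3Client, s3cd -> {
                try {
                    ContractDocument newDocument = contractDocumentStorageService.storeS3FIle(
                            s3cd.getContractDocument().getFileData(),
                            s3cd.getContractDocument().getId());
                    if (newDocument.getFileType().equals("application/pdf")) {
                        ocrSubmissionService.SubmitContractForOCR(
                                s3cd.getContractDocument().getFileData(), newDocument.getId());
                    }
                } catch (FileStorageException e) {
                    log.error("Failed to store " + s3cd.getKey(), e);
                }
            });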

Best Answer

AWS SDKs generally provide common methods for paginating results. For JavaScript there are the hasNextPage() and nextPage() methods.

You can see a Java example here: https://docs.amazonaws.cn/en_us/sdk-for-java/v2/developer-guide/examples-pagination.html
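The linked examples target the v2 SDK; with the v1 SDK used in the question (com.amazonaws.services.s3), the same idea is expressed with continuation tokens on ListObjectsV2. A minimal sketch, assuming bucketName, prefix, and s3 are in scope as in the question's code:

        // Process each page of results as it arrives instead of collecting everything first
        ListObjectsV2Request req = new ListObjectsV2Request()
                .withBucketName(bucketName)
                .withPrefix(prefix);
        ListObjectsV2Result result;
        do {
            result = s3.listObjectsV2(req);
            for (S3ObjectSummary os : result.getObjectSummaries()) {
                // fetch, process, and release one object at a time here
            }
            req.setContinuationToken(result.getNextContinuationToken());
        } while (result.isTruncated());

Because each page is handled before the next request is made, memory stays bounded by the page size (up to 1,000 keys per page by default) rather than by the total number of documents.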

Regarding java - Nightly build - need to process one at a time instead of as a list, we found a similar question on Stack Overflow: https://stackoverflow.com/questions/60396712/
