c++ - 嵌套 for 循环的奇怪性能问题

下面是完整的源代码，您只需将其复制粘贴到 Visual Studio 中即可轻松重现。

#include <Windows.h>

#include <algorithm>
#include <vector>
#include <iostream>
#include <sstream>

// Performance-counter frequency in ticks per second. Filled in by
// QueryPerformanceFrequency() at the start of _tmain and used as the divisor
// when CTimer and CSimpleProfiler convert tick deltas to microseconds.
LARGE_INTEGER gFreq;

struct CProfileData;

// Registry of every CProfileData instance: each one appends its own address
// from its constructor. Only the raw pointer is stored (the location name is
// not used as a key) for performance reasons.
std::vector<CProfileData*> gProfileData;

// Simulates a draw-buffer access so that CBlock::Draw is not optimized away.
float gDrawBuffer = 0;

struct CTimer
{
    CTimer()
    {
        Reset();
    }

    size_t GetElapsedMicro()
    {
        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);
        return (1000000 * (now.QuadPart - m_timer.QuadPart)) / gFreq.QuadPart;
    }

    inline void Reset()
    {
        ::QueryPerformanceCounter(&m_timer);
    }

    LARGE_INTEGER m_timer;
};

// Accumulated timing statistics for one profiled location.
//
// Every instance appends its own address to the global gProfileData registry
// on construction. Nothing in this struct removes that pointer again, so an
// instance has to outlive every later use of gProfileData; the SIMPLE_PROFILE
// macros guarantee this by declaring their instances `static`.
struct CProfileData
{
    // Initializers are listed in declaration order, which is the order the
    // members are actually initialized in.
    CProfileData()
        : m_totalTime(0),
          m_minTime(static_cast<size_t>(-1)), // largest size_t: any sample is smaller
          m_maxTime(0),
          m_hitCount(0),
          m_name(nullptr)
    {
        gProfileData.push_back(this);
    }

    size_t m_totalTime;   // sum of all recorded durations (CSimpleProfiler records microseconds)
    size_t m_minTime;     // shortest recorded duration; size_t(-1) until the first sample
    size_t m_maxTime;     // longest recorded duration
    size_t m_hitCount;    // number of recorded samples
    const char * m_name;  // location label, not owned; null until the first sample is recorded
};

class CSimpleProfiler
{
public:
    CSimpleProfiler(const char * aLocationName, CProfileData * aData)
        : m_location(aLocationName), m_data(aData)
    {
        ::QueryPerformanceCounter(&m_clock);
    }

    ~CSimpleProfiler()
    {
        CProfileData & data = *m_data;
        data.m_name = m_location;
        ++data.m_hitCount;


        LARGE_INTEGER now;
        ::QueryPerformanceCounter(&now);

        size_t elapsed = (1000000 * (now.QuadPart - m_clock.QuadPart)) / gFreq.QuadPart;
        data.m_totalTime += elapsed;

        elapsed < data.m_minTime ? data.m_minTime = elapsed : true;
        elapsed > data.m_maxTime ? data.m_maxTime = elapsed : true;
    }

    static void PrintAll()
    {
        std::stringstream str;
        str.width(20);
        str << "Location";
        str.width(15);
        str << "Total time";
        str.width(15);
        str << "Average time";
        str.width(15);
        str << "Hit count";
        str.width(15);
        str << "Min";
        str.width(15);
        str << "Max" << std::endl;

        ::OutputDebugStringA(str.str().c_str());

        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            CProfileData & data = **i;
            std::stringstream str;
            str.width(20);
            str << data.m_name;
            str.width(15);
            str << data.m_totalTime;
            str.width(15);
            str << data.m_totalTime / (float)data.m_hitCount;
            str.width(15);
            str << data.m_hitCount;
            str.width(15);
            str << data.m_minTime;
            str.width(15);
            str << data.m_maxTime << std::endl;

            ::OutputDebugStringA(str.str().c_str());
        }
    }

    static void Clear()
    {
        for (auto i = gProfileData.begin(); i != gProfileData.end(); ++i)
        {
            (*i)->m_totalTime = 0;
            (*i)->m_minTime = 0;
            (*i)->m_maxTime = 0;
            (*i)->m_hitCount = 0;
        }
    }

private:
    LARGE_INTEGER m_clock;
    const char * m_location;
    CProfileData * m_data;
};


// Compile-time switch: remove this definition to compile the profiling
// macros below down to __noop.
#define PROFILING_ENABLED

#ifdef PROFILING_ENABLED
// Two-level paste: operands of ## are not macro-expanded, so the extra level
// is what lets a numeric id become part of the identifier.
#define SIMPLE_PROFILE_CONCAT_IMPL(a, b) a ## b
#define SIMPLE_PROFILE_CONCAT(a, b) SIMPLE_PROFILE_CONCAT_IMPL(a, b)

// Declares a static CProfileData plus a scoped CSimpleProfiler bound to it.
// Id is expanded once before substitution, so both declarations and the
// address-of expression share the same suffix.
#define SIMPLE_PROFILE_IMPL(Name, Id) \
    static CProfileData SIMPLE_PROFILE_CONCAT(pdata, Id); \
    CSimpleProfiler SIMPLE_PROFILE_CONCAT(p, Id)(Name, & SIMPLE_PROFILE_CONCAT(pdata, Id))

// Profiles the enclosing scope, labelled with the enclosing function's name.
// __COUNTER__ gives every use its own variable names, so several profilers
// can share one scope. __LINE__ is avoided because pasting it fails under
// MSVC's /ZI (Edit and Continue), where it is not a plain integer literal.
#define SIMPLE_PROFILE SIMPLE_PROFILE_IMPL(__FUNCTION__, __COUNTER__)

// Profiles the enclosing scope under the given label.
#define SIMPLE_PROFILE_WITH_NAME(Name) SIMPLE_PROFILE_IMPL(Name, __COUNTER__)
#else
#define SIMPLE_PROFILE __noop
#define SIMPLE_PROFILE_WITH_NAME(Name) __noop
#endif


// Overwrites a 256 KiB scratch buffer 0x0fff times. Going by the name, the
// intent is to push previously cached data out of the L1 cache; the function
// has no other observable effect.
void InvalidateL1Cache()
{
    const int size = 256 * 1024;

    // Allocated once on the first call and kept for the rest of the program.
    // A vector replaces the unchecked malloc: allocation failure now throws
    // instead of leaving a null pointer to be written through.
    static std::vector<char> scratch(size);
    char * const c = scratch.data();

    for (int i = 0; i < 0x0fff; i++)
        for (int j = 0; j < size; j++)
            c[j] = static_cast<char>(i * j); // only the low byte is kept
}

// Reproduction driver. Builds one layer of 109 rows holding 20..29 blocks
// each, then draws every row in an endless loop. Each row's draw is timed
// twice: by a CTimer and by a CSimpleProfiler labelled "Main_Draw_3". When a
// row takes more than 1000 microseconds, "SLOWDOWN!" and the profiler table
// are written to the debugger output. argc and argv are unused.
int _tmain(int argc, _TCHAR* argv[])
{
    // Must run before any CTimer / CSimpleProfiler result is computed:
    // both divide by gFreq.
    ::QueryPerformanceFrequency(&gFreq);

    struct CBlock
    {
        float x;
        float y;

        // Adds aBlend to gDrawBuffer 100 times; stands in for real drawing.
        void Draw(float aBlend)
        {
            for (size_t i = 0; i < 100; ++i)
                gDrawBuffer += aBlend;
        }
    };

    typedef std::vector<std::vector<CBlock>> Layer;
    typedef std::vector<Layer> Layers;
    Layers mBlocks;

    // populate with dummy data
    mBlocks.push_back(Layer());
    Layer & layer = mBlocks.back();
    layer.resize(109);

    srand(0); // for reproducibility (determinism)
    for (auto & row : layer)
    {
        row.resize(25 + rand() % 10 - 5);
    }
    // end populating dummy data

    while (1)
    {
        // Start every pass with fresh statistics.
        CSimpleProfiler::Clear();

        // The +1 keeps the divisor in 1..100. rand() % 100 on its own can be
        // 0, which made aBlend (and from then on gDrawBuffer) infinite.
        const float aBlend = 1.f / (rand() % 100 + 1);

        for (auto & currentLayer : mBlocks)
        {
            for (auto & row : currentLayer)
            {
                CTimer t;

                {
                    // The inner scope ends the profiler sample before the
                    // CTimer is read below.
                    SIMPLE_PROFILE_WITH_NAME("Main_Draw_3");
                    for (auto & block : row)
                    {
                        block.Draw(aBlend);
                    }
                }

                if (t.GetElapsedMicro() > 1000)
                {
                    ::OutputDebugStringA("SLOWDOWN!\n");
                    CSimpleProfiler::PrintAll();
                }
            }
        }
    }

    return 0; // never reached: the loop above does not terminate
}

我不时得到以下分析，以微秒表示:

SLOWDOWN!
            Location     Total time   Average time      Hit count            Min            Max
         Main_Draw_3           2047        36.5536             56              0           1040

这会不时出现峰值。通常，Main_Draw_3 block 需要 100 微秒才能完成，但有时会达到 1000 微秒(Max 列)。是什么原因造成的？

我知道缓存未命中可能起到一定作用，但在这种情况下真的如此吗？...这里发生了什么，我该如何缓解这种情况？

更多信息:

编译器 VS 2013，使用 Maximize Speed (/O2) 编译

最佳答案

我觉得可能有两个问题:

您是否开启了优化进行编译？使用了哪些编译选项(flags)？
也许您可以增加样本量(例如，在一次分析运行中把这段代码运行十次、一百次或一千次)。原因是如果样本量小，标准差就会很高。

关于c++ - 嵌套 for 循环的奇怪性能问题，我们在Stack Overflow上找到一个类似的问题： https://stackoverflow.com/questions/26370523/

c++ - 嵌套 for 循环的奇怪性能问题

上一篇：c++ - 错误消息与命名右值引用混淆

下一篇：c++ - visual c++ 2013 是否有使用旧方言的选项(如 gcc 的 -std=c++98)？