axmol/thirdparty/bullet/LinearMath/TaskScheduler/btTaskScheduler.cpp


#include "LinearMath/btMinMax.h"
#include "LinearMath/btAlignedObjectArray.h"
#include "LinearMath/btThreads.h"
#include "LinearMath/btQuickprof.h"
#include <stdio.h>
#include <algorithm>

#if BT_THREADSAFE

#include "btThreadSupportInterface.h"

#if defined(_WIN32)

#define WIN32_LEAN_AND_MEAN

#include <windows.h>

#endif

// 64-bit unsigned integer; in this file it holds the microsecond timestamps returned by btClock.
typedef unsigned long long btU64;
// Byte alignment requested for the job memory buffer, and the size of JobQueue's padding member.
static const int kCacheLineSize = 64;

// Tells the CPU that the calling thread is inside a busy-wait loop.
//
// Every spin loop in this file calls this between polls. On Windows it expands
// to YieldProcessor(); on GCC/Clang x86 and ARM targets it issues the equivalent
// spin-wait hint instruction so the loops do not run at full speed against a
// sibling hardware thread. On any other target it is a no-op, as before.
void btSpinPause()
{
#if defined(_WIN32)
	YieldProcessor();
#elif defined(__GNUC__) && (defined(__i386__) || defined(__x86_64__))
	// x86 spin-wait hint
	__asm__ __volatile__("pause");
#elif defined(__GNUC__) && (defined(__aarch64__) || (defined(__ARM_ARCH) && (__ARM_ARCH >= 7)))
	// ARM spin-wait hint
	__asm__ __volatile__("yield");
#endif
}

// Scope holder for the state a worker thread records in its ThreadLocalStorage.
struct WorkerThreadStatus
{
	enum Type
	{
		kInvalid = 0,         // zero value
		kWaitingForWork = 1,  // worker has drained the queues it can reach and is polling
		kWorking = 2,         // worker is executing a job
		kSleeping = 3         // worker thread function has returned (or was never started)
	};
};

// Table of one-byte "directives" (one slot per thread id) that the scheduler
// writes and the worker threads poll to decide whether to keep scanning for
// jobs, idle, or go to sleep.
ATTRIBUTE_ALIGNED64(class)
WorkerThreadDirectives
{
	static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;
	// directives for all worker threads packed into a single cacheline
	char m_threadDirs[kMaxThreadCount];

public:
	enum Type
	{
		kInvalid,
		kGoToSleep,         // leave the worker loop and go to sleep
		kStayAwakeButIdle,  // stay awake but idle; the worker sleeps once its cooldown elapses
		kScanForJobs,       // actively scan job queue for jobs
	};

	// Sets every slot to 0 (kInvalid).
	WorkerThreadDirectives()
	{
		for (int i = 0; i < kMaxThreadCount; ++i)
		{
			m_threadDirs[i] = 0;
		}
	}

	// Returns the directive currently stored for threadId.
	// threadId must be in [0, kMaxThreadCount).
	Type getDirective(int threadId) const
	{
		btAssert(threadId >= 0 && threadId < kMaxThreadCount);
		return static_cast<Type>(m_threadDirs[threadId]);
	}

	// Stores dir for every thread id in the half-open range [threadBegin, threadEnd).
	// An empty range (threadBegin == threadEnd) is valid and writes nothing; callers
	// reach that case when there are no worker threads or when every slot is in use.
	void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)
	{
		btAssert(threadBegin >= 0);
		btAssert(threadBegin <= threadEnd);
		btAssert(threadEnd <= kMaxThreadCount);
		char dirChar = static_cast<char>(dir);
		for (int i = threadBegin; i < threadEnd; ++i)
		{
			m_threadDirs[i] = dirChar;
		}
	}
};

class JobQueue;

// Per-thread record owned by the scheduler (one entry per thread id, including
// the main thread at index 0). A pointer to the entry is what a worker thread
// receives as its user pointer.
ATTRIBUTE_ALIGNED64(struct)
ThreadLocalStorage
{
	// index of this entry; passed to IJob::executeJob by the worker
	int m_threadId;
	// written by the worker thread function, read by the scheduler when waking workers
	WorkerThreadStatus::Type m_status;
	// incremented by the worker after each job; reset and summed by the scheduler under m_mutex
	int m_numJobsFinished;
	// held by the worker while it drains jobs; taken by the scheduler to read/reset m_numJobsFinished
	btSpinMutex m_mutex;
	// partial sum accumulated by ParallelSumJob for this thread id
	btScalar m_sumResult;
	// directive table polled by the worker
	WorkerThreadDirectives* m_directive;
	// queue this thread consumes from; the scheduler leaves it NULL for thread 0
	JobQueue* m_queue;
	// clock used by the worker to time how long its queue has been empty
	btClock* m_clock;
	// microseconds of idleness after which the worker goes to sleep
	unsigned int m_cooldownTime;
};

// Interface for one unit of work stored in a JobQueue.
struct IJob
{
	// Virtual so that destroying a job through an IJob pointer is well-defined.
	virtual ~IJob() {}

	// Runs the job. threadId is the id of the thread executing it
	// (the scheduler passes 0 when the main thread runs the job).
	virtual void executeJob(int threadId) = 0;
};

class ParallelForJob : public IJob
{
	const btIParallelForBody* m_body;
	int m_begin;
	int m_end;

public:
	ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body)
	{
		m_body = &body;
		m_begin = iBegin;
		m_end = iEnd;
	}
	virtual void executeJob(int threadId) BT_OVERRIDE
	{
		BT_PROFILE("executeJob");

		// call the functor body to do the work
		m_body->forLoop(m_begin, m_end);
	}
};

class ParallelSumJob : public IJob
{
	const btIParallelSumBody* m_body;
	ThreadLocalStorage* m_threadLocalStoreArray;
	int m_begin;
	int m_end;

public:
	ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls)
	{
		m_body = &body;
		m_threadLocalStoreArray = tls;
		m_begin = iBegin;
		m_end = iEnd;
	}
	virtual void executeJob(int threadId) BT_OVERRIDE
	{
		BT_PROFILE("executeJob");

		// call the functor body to do the work
		btScalar val = m_body->sumLoop(m_begin, m_end);
#if BT_PARALLEL_SUM_DETERMINISTISM
		// by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)
		const float TRUNC_SCALE = float(1 << 19);
		val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE;  // truncate some bits
#endif
		m_threadLocalStoreArray[threadId].m_sumResult += val;
	}
};

// Queue of jobs for one or more worker threads.
//
// Jobs are constructed by the caller in memory handed out by allocJobMem() and
// then published with submitJob(). Consumers take jobs with consumeJob(), which
// first tries this queue and then the neighbor queues chosen by setupJobStealing().
// Head/tail indices and the "empty" flag are updated under the queue lock;
// isQueueEmpty() and the early-out in consumeJobFromOwnQueue() read the flag
// without taking the lock.
//
// NOTE(review): the class frees m_jobMem and the critical section in its
// destructor but is implicitly copyable; copying a queue that already owns
// either resource would lead to a double release.
ATTRIBUTE_ALIGNED64(class)
JobQueue
{
	btThreadSupportInterface* m_threadSupport;  // creates/deletes m_queueLock
	btCriticalSection* m_queueLock;             // lock used when m_useSpinMutex is false
	btSpinMutex m_mutex;                        // lock used when m_useSpinMutex is true

	btAlignedObjectArray<IJob*> m_jobQueue;  // submitted jobs, consumed in order via m_headIndex
	char* m_jobMem;                          // backing storage the job objects are placed in
	int m_jobMemSize;                        // size of m_jobMem in bytes (0 when m_jobMem is NULL)
	bool m_queueIsEmpty;                     // true when m_headIndex has caught up with m_tailIndex
	int m_tailIndex;                         // number of jobs published so far
	int m_headIndex;                         // index of the next job to consume
	int m_allocSize;                         // bytes of m_jobMem handed out so far
	bool m_useSpinMutex;
	btAlignedObjectArray<JobQueue*> m_neighborContexts;  // queues to steal from when this one is empty
	char m_cachePadding[kCacheLineSize];                 // prevent false sharing

	// Releases the job memory buffer, if any, and records that no buffer is held.
	void freeJobMem()
	{
		if (m_jobMem)
		{
			// free old
			btAlignedFree(m_jobMem);
			m_jobMem = NULL;
		}
		// keep the recorded size in sync with the buffer, so a later resizeJobMem()
		// cannot conclude that a freed buffer is still large enough
		m_jobMemSize = 0;
	}

	// Grows the job memory buffer to newSize bytes. Never shrinks; existing contents are discarded.
	void resizeJobMem(int newSize)
	{
		if (newSize > m_jobMemSize)
		{
			freeJobMem();
			m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));
			m_jobMemSize = newSize;
		}
	}

public:
	// Creates an empty queue that owns no job memory and no lock yet.
	JobQueue()
	{
		m_jobMem = NULL;
		m_jobMemSize = 0;
		m_threadSupport = NULL;
		m_queueLock = NULL;
		m_headIndex = 0;
		m_tailIndex = 0;
		m_allocSize = 0;
		// a freshly constructed queue holds no jobs; without this the flag would be
		// indeterminate until the first clearQueue()
		m_queueIsEmpty = true;
		m_useSpinMutex = false;
	}
	~JobQueue()
	{
		exit();
	}

	// Frees the job memory and, if one was created, the critical section. Safe to call more than once.
	void exit()
	{
		freeJobMem();
		if (m_queueLock && m_threadSupport)
		{
			m_threadSupport->deleteCriticalSection(m_queueLock);
			m_queueLock = NULL;
			m_threadSupport = 0;
		}
	}

	// Binds the queue to a thread-support object (used to create the queue lock) and
	// sets up job stealing with every queue in contextArray considered active.
	void init(btThreadSupportInterface* threadSup, btAlignedObjectArray<JobQueue>* contextArray)
	{
		m_threadSupport = threadSup;
		if (threadSup)
		{
			m_queueLock = m_threadSupport->createCriticalSection();
		}
		setupJobStealing(contextArray, contextArray->size());
	}

	// Rebuilds the list of neighbor queues this queue may steal from.
	// The queue locates its own index in contextArray (index 0 is assumed if it is
	// not found) and picks at most min(2, size - 1) neighbors, trying the offsets
	// -1, +1, -2, +2, -3, +3 in that order and keeping only indices inside
	// [0, numActiveContexts).
	void setupJobStealing(btAlignedObjectArray<JobQueue>* contextArray, int numActiveContexts)
	{
		btAlignedObjectArray<JobQueue>& contexts = *contextArray;
		int selfIndex = 0;
		for (int i = 0; i < contexts.size(); ++i)
		{
			if (this == &contexts[i])
			{
				selfIndex = i;
				break;
			}
		}
		int numNeighbors = btMin(2, contexts.size() - 1);
		int neighborOffsets[] = {-1, 1, -2, 2, -3, 3};
		int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]);
		m_neighborContexts.reserve(numNeighbors);
		m_neighborContexts.resizeNoInitialize(0);
		for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)
		{
			int neighborIndex = selfIndex + neighborOffsets[i];
			if (neighborIndex >= 0 && neighborIndex < numActiveContexts)
			{
				m_neighborContexts.push_back(&contexts[neighborIndex]);
			}
		}
	}

	// Unlocked read of the "empty" flag.
	bool isQueueEmpty() const { return m_queueIsEmpty; }

	// Acquires whichever lock is selected by m_useSpinMutex.
	void lockQueue()
	{
		if (m_useSpinMutex)
		{
			m_mutex.lock();
		}
		else
		{
			m_queueLock->lock();
		}
	}

	// Releases the lock taken by lockQueue().
	void unlockQueue()
	{
		if (m_useSpinMutex)
		{
			m_mutex.unlock();
		}
		else
		{
			m_queueLock->unlock();
		}
	}

	// Resets the queue to empty and makes sure it can hold jobCount jobs of jobSize bytes each.
	void clearQueue(int jobCount, int jobSize)
	{
		lockQueue();
		m_headIndex = 0;
		m_tailIndex = 0;
		m_allocSize = 0;
		m_queueIsEmpty = true;
		int jobBufSize = jobSize * jobCount;
		// make sure we have enough memory allocated to store jobs
		if (jobBufSize > m_jobMemSize)
		{
			resizeJobMem(jobBufSize);
		}
		// make sure job queue is big enough
		if (jobCount > m_jobQueue.capacity())
		{
			m_jobQueue.reserve(jobCount);
		}
		unlockQueue();
		m_jobQueue.resizeNoInitialize(0);
	}

	// Hands out the next jobSize bytes of the job buffer. The caller must have sized
	// the buffer with clearQueue() first; no bounds check is done in release builds.
	void* allocJobMem(int jobSize)
	{
		btAssert(m_jobMemSize >= (m_allocSize + jobSize));
		void* jobMem = &m_jobMem[m_allocSize];
		m_allocSize += jobSize;
		return jobMem;
	}

	// Publishes a job that was constructed in memory obtained from allocJobMem().
	// The pointer is appended before the lock is taken; it becomes visible to
	// consumers when m_tailIndex is advanced under the lock.
	void submitJob(IJob* job)
	{
		btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
		m_jobQueue.push_back(job);
		lockQueue();
		m_tailIndex++;
		m_queueIsEmpty = false;
		unlockQueue();
	}

	// Takes the next job from this queue, or returns NULL if it is empty.
	IJob* consumeJobFromOwnQueue()
	{
		if (m_queueIsEmpty)
		{
			// lock free path. even if this is taken erroneously it isn't harmful
			return NULL;
		}
		IJob* job = NULL;
		lockQueue();
		// re-check under the lock: another consumer may have emptied the queue meanwhile
		if (!m_queueIsEmpty)
		{
			job = m_jobQueue[m_headIndex++];
			btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);
			if (m_headIndex == m_tailIndex)
			{
				m_queueIsEmpty = true;
			}
		}
		unlockQueue();
		return job;
	}

	// Takes the next job from this queue or, failing that, from the first neighbor
	// queue that has one. Returns NULL when none of them has a job.
	IJob* consumeJob()
	{
		if (IJob* job = consumeJobFromOwnQueue())
		{
			return job;
		}
		// own queue is empty, try to steal from neighbor
		for (int i = 0; i < m_neighborContexts.size(); ++i)
		{
			JobQueue* otherContext = m_neighborContexts[i];
			if (IJob* job = otherContext->consumeJobFromOwnQueue())
			{
				return job;
			}
		}
		return NULL;
	}
};

// Entry point executed by a worker thread each time it is woken.
//
// userPtr is the worker's ThreadLocalStorage. The function alternates between
// draining jobs (own queue first, then neighbors via JobQueue::consumeJob) and
// polling while the queue is empty. It returns, after marking the worker as
// kSleeping, when the worker's directive is kGoToSleep or when the queue has
// stayed empty for longer than m_cooldownTime microseconds without the
// directive being kScanForJobs.
static void WorkerThreadFunc(void* userPtr)
{
	BT_PROFILE("WorkerThreadFunc");
	ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr;
	JobQueue* jobQueue = localStorage->m_queue;

	bool shouldSleep = false;
	int threadId = localStorage->m_threadId;
	while (!shouldSleep)
	{
		// do work
		// m_mutex is held for the whole drain, so anyone locking it to read
		// m_numJobsFinished waits until this batch of jobs is done
		localStorage->m_mutex.lock();
		while (IJob* job = jobQueue->consumeJob())
		{
			localStorage->m_status = WorkerThreadStatus::kWorking;
			job->executeJob(threadId);
			localStorage->m_numJobsFinished++;
		}
		localStorage->m_status = WorkerThreadStatus::kWaitingForWork;
		localStorage->m_mutex.unlock();
		// start of the idle period used for the cooldown check below
		btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();
		// while queue is empty,
		while (jobQueue->isQueueEmpty())
		{
			// todo: spin wait a bit to avoid hammering the empty queue
			btSpinPause();
			// an explicit sleep request ends the worker loop immediately
			if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep)
			{
				shouldSleep = true;
				break;
			}
			// if jobs are incoming,
			if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs)
			{
				clockStart = localStorage->m_clock->getTimeMicroseconds();  // reset clock
			}
			else
			{
				// idle: back off for up to 50 rounds of four pauses, leaving early
				// if jobs are announced or the queue becomes non-empty
				for (int i = 0; i < 50; ++i)
				{
					btSpinPause();
					btSpinPause();
					btSpinPause();
					btSpinPause();
					if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())
					{
						break;
					}
				}
				// if no jobs incoming and queue has been empty for the cooldown time, sleep
				btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;
				if (timeElapsed > localStorage->m_cooldownTime)
				{
					shouldSleep = true;
					break;
				}
			}
		}
	}
	{
		BT_PROFILE("sleep");
		// go sleep
		// publish the sleeping state under the mutex just before returning
		localStorage->m_mutex.lock();
		localStorage->m_status = WorkerThreadStatus::kSleeping;
		localStorage->m_mutex.unlock();
	}
}

class btTaskSchedulerDefault : public btITaskScheduler
{
	btThreadSupportInterface* m_threadSupport;
	WorkerThreadDirectives* m_workerDirective;
	btAlignedObjectArray<JobQueue> m_jobQueues;
	btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;
	btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;
	btSpinMutex m_antiNestingLock;  // prevent nested parallel-for
	btClock m_clock;
	int m_numThreads;
	int m_numWorkerThreads;
	int m_numActiveJobQueues;
	int m_maxNumThreads;
	int m_numJobs;
	static const int kFirstWorkerThreadId = 1;

public:
	btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")
	{
		m_threadSupport = NULL;
		m_workerDirective = NULL;
	}

	virtual ~btTaskSchedulerDefault()
	{
		waitForWorkersToSleep();

		for (int i = 0; i < m_jobQueues.size(); ++i)
		{
			m_jobQueues[i].exit();
		}

		if (m_threadSupport)
		{
			delete m_threadSupport;
			m_threadSupport = NULL;
		}
		if (m_workerDirective)
		{
			btAlignedFree(m_workerDirective);
			m_workerDirective = NULL;
		}
	}

	void init()
	{
		btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc);
		m_threadSupport = btThreadSupportInterface::create(constructionInfo);
		m_workerDirective = static_cast<WorkerThreadDirectives*>(btAlignedAlloc(sizeof(*m_workerDirective), 64));

		m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();
		m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;
		m_numThreads = m_maxNumThreads;
		// ideal to have one job queue for each physical processor (except for the main thread which needs no queue)
		int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();
		int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue);
		m_jobQueues.resize(numJobQueues);
		m_numActiveJobQueues = numJobQueues;
		for (int i = 0; i < m_jobQueues.size(); ++i)
		{
			m_jobQueues[i].init(m_threadSupport, &m_jobQueues);
		}
		m_perThreadJobQueues.resize(m_numThreads);
		for (int i = 0; i < m_numThreads; i++)
		{
			JobQueue* jq = NULL;
			// only worker threads get a job queue
			if (i > 0)
			{
				if (numThreadsPerQueue == 1)
				{
					// one queue per worker thread
					jq = &m_jobQueues[i - kFirstWorkerThreadId];
				}
				else
				{
					// 2 threads share each queue
					jq = &m_jobQueues[i / numThreadsPerQueue];
				}
			}
			m_perThreadJobQueues[i] = jq;
		}
		m_threadLocalStorage.resize(m_numThreads);
		for (int i = 0; i < m_numThreads; i++)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			storage.m_threadId = i;
			storage.m_directive = m_workerDirective;
			storage.m_status = WorkerThreadStatus::kSleeping;
			storage.m_cooldownTime = 100;  // 100 microseconds, threads go to sleep after this long if they have nothing to do
			storage.m_clock = &m_clock;
			storage.m_queue = m_perThreadJobQueues[i];
		}
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);  // no work for them yet
		setNumThreads(m_threadSupport->getCacheFriendlyNumThreads());
	}

	void setWorkerDirectives(WorkerThreadDirectives::Type dir)
	{
		m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);
	}

	virtual int getMaxNumThreads() const BT_OVERRIDE
	{
		return m_maxNumThreads;
	}

	virtual int getNumThreads() const BT_OVERRIDE
	{
		return m_numThreads;
	}

	virtual void setNumThreads(int numThreads) BT_OVERRIDE
	{
		m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1);
		m_numWorkerThreads = m_numThreads - 1;
		m_numActiveJobQueues = 0;
		// if there is at least 1 worker,
		if (m_numWorkerThreads > 0)
		{
			// re-setup job stealing between queues to avoid attempting to steal from an inactive job queue
			JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1];
			int iLastActiveContext = lastActiveContext - &m_jobQueues[0];
			m_numActiveJobQueues = iLastActiveContext + 1;
			for (int i = 0; i < m_jobQueues.size(); ++i)
			{
				m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues);
			}
		}
		m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);
	}

	void waitJobs()
	{
		BT_PROFILE("waitJobs");
		// have the main thread work until the job queues are empty
		int numMainThreadJobsFinished = 0;
		for (int i = 0; i < m_numActiveJobQueues; ++i)
		{
			while (IJob* job = m_jobQueues[i].consumeJob())
			{
				job->executeJob(0);
				numMainThreadJobsFinished++;
			}
		}

		// done with jobs for now, tell workers to rest (but not sleep)
		setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle);

		btU64 clockStart = m_clock.getTimeMicroseconds();
		// wait for workers to finish any jobs in progress
		while (true)
		{
			int numWorkerJobsFinished = 0;
			for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread)
			{
				ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];
				storage->m_mutex.lock();
				numWorkerJobsFinished += storage->m_numJobsFinished;
				storage->m_mutex.unlock();
			}
			if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)
			{
				break;
			}
			btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;
			btAssert(timeElapsed < 1000);
			if (timeElapsed > 100000)
			{
				break;
			}
			btSpinPause();
		}
	}

	void wakeWorkers(int numWorkersToWake)
	{
		BT_PROFILE("wakeWorkers");
		btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs);
		int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);
		int numActiveWorkers = 0;
		for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker)
		{
			// note this count of active workers is not necessarily totally reliable, because a worker thread could be
			// just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.
			ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
			if (storage.m_status != WorkerThreadStatus::kSleeping)
			{
				numActiveWorkers++;
			}
		}
		for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];
			if (storage.m_status == WorkerThreadStatus::kSleeping)
			{
				m_threadSupport->runTask(iWorker, &storage);
				numActiveWorkers++;
			}
		}
	}

	void waitForWorkersToSleep()
	{
		BT_PROFILE("waitForWorkersToSleep");
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
		m_threadSupport->waitForAllTasks();
		for (int i = kFirstWorkerThreadId; i < m_numThreads; i++)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			btAssert(storage.m_status == WorkerThreadStatus::kSleeping);
		}
	}

	virtual void sleepWorkerThreadsHint() BT_OVERRIDE
	{
		BT_PROFILE("sleepWorkerThreadsHint");
		// hint the task scheduler that we may not be using these threads for a little while
		setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);
	}

	void prepareWorkerThreads()
	{
		for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i)
		{
			ThreadLocalStorage& storage = m_threadLocalStorage[i];
			storage.m_mutex.lock();
			storage.m_numJobsFinished = 0;
			storage.m_mutex.unlock();
		}
		setWorkerDirectives(WorkerThreadDirectives::kScanForJobs);
	}

	virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelFor_ThreadSupport");
		btAssert(iEnd >= iBegin);
		btAssert(grainSize >= 1);
		int iterationCount = iEnd - iBegin;
		if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
		{
			typedef ParallelForJob JobType;
			int jobCount = (iterationCount + grainSize - 1) / grainSize;
			m_numJobs = jobCount;
			btAssert(jobCount >= 2);  // need more than one job for multithreading
			int jobSize = sizeof(JobType);

			for (int i = 0; i < m_numActiveJobQueues; ++i)
			{
				m_jobQueues[i].clearQueue(jobCount, jobSize);
			}
			// prepare worker threads for incoming work
			prepareWorkerThreads();
			// submit all of the jobs
			int iJob = 0;
			int iThread = kFirstWorkerThreadId;  // first worker thread
			for (int i = iBegin; i < iEnd; i += grainSize)
			{
				btAssert(iJob < jobCount);
				int iE = btMin(i + grainSize, iEnd);
				JobQueue* jq = m_perThreadJobQueues[iThread];
				btAssert(jq);
				btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
				void* jobMem = jq->allocJobMem(jobSize);
				JobType* job = new (jobMem) ParallelForJob(i, iE, body);  // placement new
				jq->submitJob(job);
				iJob++;
				iThread++;
				if (iThread >= m_numThreads)
				{
					iThread = kFirstWorkerThreadId;  // first worker thread
				}
			}
			wakeWorkers(jobCount - 1);

			// put the main thread to work on emptying the job queue and then wait for all workers to finish
			waitJobs();
			m_antiNestingLock.unlock();
		}
		else
		{
			BT_PROFILE("parallelFor_mainThread");
			// just run on main thread
			body.forLoop(iBegin, iEnd);
		}
	}
	virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE
	{
		BT_PROFILE("parallelSum_ThreadSupport");
		btAssert(iEnd >= iBegin);
		btAssert(grainSize >= 1);
		int iterationCount = iEnd - iBegin;
		if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())
		{
			typedef ParallelSumJob JobType;
			int jobCount = (iterationCount + grainSize - 1) / grainSize;
			m_numJobs = jobCount;
			btAssert(jobCount >= 2);  // need more than one job for multithreading
			int jobSize = sizeof(JobType);
			for (int i = 0; i < m_numActiveJobQueues; ++i)
			{
				m_jobQueues[i].clearQueue(jobCount, jobSize);
			}

			// initialize summation
			for (int iThread = 0; iThread < m_numThreads; ++iThread)
			{
				m_threadLocalStorage[iThread].m_sumResult = btScalar(0);
			}

			// prepare worker threads for incoming work
			prepareWorkerThreads();
			// submit all of the jobs
			int iJob = 0;
			int iThread = kFirstWorkerThreadId;  // first worker thread
			for (int i = iBegin; i < iEnd; i += grainSize)
			{
				btAssert(iJob < jobCount);
				int iE = btMin(i + grainSize, iEnd);
				JobQueue* jq = m_perThreadJobQueues[iThread];
				btAssert(jq);
				btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);
				void* jobMem = jq->allocJobMem(jobSize);
				JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]);  // placement new
				jq->submitJob(job);
				iJob++;
				iThread++;
				if (iThread >= m_numThreads)
				{
					iThread = kFirstWorkerThreadId;  // first worker thread
				}
			}
			wakeWorkers(jobCount - 1);

			// put the main thread to work on emptying the job queue and then wait for all workers to finish
			waitJobs();

			// add up all the thread sums
			btScalar sum = btScalar(0);
			for (int iThread = 0; iThread < m_numThreads; ++iThread)
			{
				sum += m_threadLocalStorage[iThread].m_sumResult;
			}
			m_antiNestingLock.unlock();
			return sum;
		}
		else
		{
			BT_PROFILE("parallelSum_mainThread");
			// just run on main thread
			return body.sumLoop(iBegin, iEnd);
		}
	}
};

// Allocates a btTaskSchedulerDefault with new, initializes it, and returns it.
btITaskScheduler* btCreateDefaultTaskScheduler()
{
	btTaskSchedulerDefault* scheduler = new btTaskSchedulerDefault();
	scheduler->init();
	return scheduler;
}

#else  // #if BT_THREADSAFE

// Build without BT_THREADSAFE: no default task scheduler is available, so NULL is returned.
btITaskScheduler* btCreateDefaultTaskScheduler()
{
	btITaskScheduler* noScheduler = NULL;
	return noScheduler;
}

#endif  // #else // #if BT_THREADSAFE
Embed third parties 2020-11-16 14:47:43 +08:00
			`#include "LinearMath/btMinMax.h"`
			`#include "LinearMath/btAlignedObjectArray.h"`
			`#include "LinearMath/btThreads.h"`
			`#include "LinearMath/btQuickprof.h"`
			`#include <stdio.h>`
			`#include <algorithm>`

			`#if BT_THREADSAFE`

			`#include "btThreadSupportInterface.h"`

			`#if defined(_WIN32)`

			`#define WIN32_LEAN_AND_MEAN`

			`#include <windows.h>`

			`#endif`

			`typedef unsigned long long btU64;`
			`static const int kCacheLineSize = 64;`

			`void btSpinPause()`
			`{`
			`#if defined(_WIN32)`
			`YieldProcessor();`
			`#endif`
			`}`

			`struct WorkerThreadStatus`
			`{`
			`enum Type`
			`{`
			`kInvalid,`
			`kWaitingForWork,`
			`kWorking,`
			`kSleeping,`
			`};`
			`};`

			`ATTRIBUTE_ALIGNED64(class)`
			`WorkerThreadDirectives`
			`{`
			`static const int kMaxThreadCount = BT_MAX_THREAD_COUNT;`
			`// directives for all worker threads packed into a single cacheline`
			`char m_threadDirs[kMaxThreadCount];`

			`public:`
			`enum Type`
			`{`
			`kInvalid,`
			`kGoToSleep, // go to sleep`
			`kStayAwakeButIdle, // wait for not checking job queue`
			`kScanForJobs, // actively scan job queue for jobs`
			`};`
			`WorkerThreadDirectives()`
			`{`
			`for (int i = 0; i < kMaxThreadCount; ++i)`
			`{`
			`m_threadDirs[i] = 0;`
			`}`
			`}`

			`Type getDirective(int threadId)`
			`{`
			`btAssert(threadId < kMaxThreadCount);`
			`return static_cast<Type>(m_threadDirs[threadId]);`
			`}`

			`void setDirectiveByRange(int threadBegin, int threadEnd, Type dir)`
			`{`
			`btAssert(threadBegin < threadEnd);`
			`btAssert(threadEnd <= kMaxThreadCount);`
			`char dirChar = static_cast<char>(dir);`
			`for (int i = threadBegin; i < threadEnd; ++i)`
			`{`
			`m_threadDirs[i] = dirChar;`
			`}`
			`}`
			`};`

			`class JobQueue;`

			`ATTRIBUTE_ALIGNED64(struct)`
			`ThreadLocalStorage`
			`{`
			`int m_threadId;`
			`WorkerThreadStatus::Type m_status;`
			`int m_numJobsFinished;`
			`btSpinMutex m_mutex;`
			`btScalar m_sumResult;`
			`WorkerThreadDirectives* m_directive;`
			`JobQueue* m_queue;`
			`btClock* m_clock;`
			`unsigned int m_cooldownTime;`
			`};`

			`struct IJob`
			`{`
			`virtual void executeJob(int threadId) = 0;`
			`};`

			`class ParallelForJob : public IJob`
			`{`
			`const btIParallelForBody* m_body;`
			`int m_begin;`
			`int m_end;`

			`public:`
			`ParallelForJob(int iBegin, int iEnd, const btIParallelForBody& body)`
			`{`
			`m_body = &body;`
			`m_begin = iBegin;`
			`m_end = iEnd;`
			`}`
			`virtual void executeJob(int threadId) BT_OVERRIDE`
			`{`
			`BT_PROFILE("executeJob");`

			`// call the functor body to do the work`
			`m_body->forLoop(m_begin, m_end);`
			`}`
			`};`

			`class ParallelSumJob : public IJob`
			`{`
			`const btIParallelSumBody* m_body;`
			`ThreadLocalStorage* m_threadLocalStoreArray;`
			`int m_begin;`
			`int m_end;`

			`public:`
			`ParallelSumJob(int iBegin, int iEnd, const btIParallelSumBody& body, ThreadLocalStorage* tls)`
			`{`
			`m_body = &body;`
			`m_threadLocalStoreArray = tls;`
			`m_begin = iBegin;`
			`m_end = iEnd;`
			`}`
			`virtual void executeJob(int threadId) BT_OVERRIDE`
			`{`
			`BT_PROFILE("executeJob");`

			`// call the functor body to do the work`
			`btScalar val = m_body->sumLoop(m_begin, m_end);`
			`#if BT_PARALLEL_SUM_DETERMINISTISM`
			`// by truncating bits of the result, we can make the parallelSum deterministic (at the expense of precision)`
			`const float TRUNC_SCALE = float(1 << 19);`
			`val = floor(val * TRUNC_SCALE + 0.5f) / TRUNC_SCALE; // truncate some bits`
			`#endif`
			`m_threadLocalStoreArray[threadId].m_sumResult += val;`
			`}`
			`};`

			`ATTRIBUTE_ALIGNED64(class)`
			`JobQueue`
			`{`
			`btThreadSupportInterface* m_threadSupport;`
			`btCriticalSection* m_queueLock;`
			`btSpinMutex m_mutex;`

			`btAlignedObjectArray<IJob*> m_jobQueue;`
			`char* m_jobMem;`
			`int m_jobMemSize;`
			`bool m_queueIsEmpty;`
			`int m_tailIndex;`
			`int m_headIndex;`
			`int m_allocSize;`
			`bool m_useSpinMutex;`
			`btAlignedObjectArray<JobQueue*> m_neighborContexts;`
			`char m_cachePadding[kCacheLineSize]; // prevent false sharing`

			`void freeJobMem()`
			`{`
			`if (m_jobMem)`
			`{`
			`// free old`
			`btAlignedFree(m_jobMem);`
			`m_jobMem = NULL;`
			`}`
			`}`
			`void resizeJobMem(int newSize)`
			`{`
			`if (newSize > m_jobMemSize)`
			`{`
			`freeJobMem();`
			`m_jobMem = static_cast<char*>(btAlignedAlloc(newSize, kCacheLineSize));`
			`m_jobMemSize = newSize;`
			`}`
			`}`

			`public:`
			`JobQueue()`
			`{`
			`m_jobMem = NULL;`
			`m_jobMemSize = 0;`
			`m_threadSupport = NULL;`
			`m_queueLock = NULL;`
			`m_headIndex = 0;`
			`m_tailIndex = 0;`
			`m_useSpinMutex = false;`
			`}`
			`~JobQueue()`
			`{`
			`exit();`
			`}`
			`void exit()`
			`{`
			`freeJobMem();`
			`if (m_queueLock && m_threadSupport)`
			`{`
			`m_threadSupport->deleteCriticalSection(m_queueLock);`
			`m_queueLock = NULL;`
			`m_threadSupport = 0;`
			`}`
			`}`

			`void init(btThreadSupportInterface * threadSup, btAlignedObjectArray<JobQueue> * contextArray)`
			`{`
			`m_threadSupport = threadSup;`
			`if (threadSup)`
			`{`
			`m_queueLock = m_threadSupport->createCriticalSection();`
			`}`
			`setupJobStealing(contextArray, contextArray->size());`
			`}`
			`void setupJobStealing(btAlignedObjectArray<JobQueue> * contextArray, int numActiveContexts)`
			`{`
			`btAlignedObjectArray<JobQueue>& contexts = *contextArray;`
			`int selfIndex = 0;`
			`for (int i = 0; i < contexts.size(); ++i)`
			`{`
			`if (this == &contexts[i])`
			`{`
			`selfIndex = i;`
			`break;`
			`}`
			`}`
			`int numNeighbors = btMin(2, contexts.size() - 1);`
			`int neighborOffsets[] = {-1, 1, -2, 2, -3, 3};`
			`int numOffsets = sizeof(neighborOffsets) / sizeof(neighborOffsets[0]);`
			`m_neighborContexts.reserve(numNeighbors);`
			`m_neighborContexts.resizeNoInitialize(0);`
			`for (int i = 0; i < numOffsets && m_neighborContexts.size() < numNeighbors; i++)`
			`{`
			`int neighborIndex = selfIndex + neighborOffsets[i];`
			`if (neighborIndex >= 0 && neighborIndex < numActiveContexts)`
			`{`
			`m_neighborContexts.push_back(&contexts[neighborIndex]);`
			`}`
			`}`
			`}`

			`bool isQueueEmpty() const { return m_queueIsEmpty; }`
			`void lockQueue()`
			`{`
			`if (m_useSpinMutex)`
			`{`
			`m_mutex.lock();`
			`}`
			`else`
			`{`
			`m_queueLock->lock();`
			`}`
			`}`
			`void unlockQueue()`
			`{`
			`if (m_useSpinMutex)`
			`{`
			`m_mutex.unlock();`
			`}`
			`else`
			`{`
			`m_queueLock->unlock();`
			`}`
			`}`
			`void clearQueue(int jobCount, int jobSize)`
			`{`
			`lockQueue();`
			`m_headIndex = 0;`
			`m_tailIndex = 0;`
			`m_allocSize = 0;`
			`m_queueIsEmpty = true;`
			`int jobBufSize = jobSize * jobCount;`
			`// make sure we have enough memory allocated to store jobs`
			`if (jobBufSize > m_jobMemSize)`
			`{`
			`resizeJobMem(jobBufSize);`
			`}`
			`// make sure job queue is big enough`
			`if (jobCount > m_jobQueue.capacity())`
			`{`
			`m_jobQueue.reserve(jobCount);`
			`}`
			`unlockQueue();`
			`m_jobQueue.resizeNoInitialize(0);`
			`}`
			`void* allocJobMem(int jobSize)`
			`{`
			`btAssert(m_jobMemSize >= (m_allocSize + jobSize));`
			`void* jobMem = &m_jobMem[m_allocSize];`
			`m_allocSize += jobSize;`
			`return jobMem;`
			`}`
			`void submitJob(IJob * job)`
			`{`
			`btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);`
			`m_jobQueue.push_back(job);`
			`lockQueue();`
			`m_tailIndex++;`
			`m_queueIsEmpty = false;`
			`unlockQueue();`
			`}`
			`IJob* consumeJobFromOwnQueue()`
			`{`
			`if (m_queueIsEmpty)`
			`{`
			`// lock free path. even if this is taken erroneously it isn't harmful`
			`return NULL;`
			`}`
			`IJob* job = NULL;`
			`lockQueue();`
			`if (!m_queueIsEmpty)`
			`{`
			`job = m_jobQueue[m_headIndex++];`
			`btAssert(reinterpret_cast<char*>(job) >= &m_jobMem[0] && reinterpret_cast<char*>(job) < &m_jobMem[0] + m_allocSize);`
			`if (m_headIndex == m_tailIndex)`
			`{`
			`m_queueIsEmpty = true;`
			`}`
			`}`
			`unlockQueue();`
			`return job;`
			`}`
			`IJob* consumeJob()`
			`{`
			`if (IJob* job = consumeJobFromOwnQueue())`
			`{`
			`return job;`
			`}`
			`// own queue is empty, try to steal from neighbor`
			`for (int i = 0; i < m_neighborContexts.size(); ++i)`
			`{`
			`JobQueue* otherContext = m_neighborContexts[i];`
			`if (IJob* job = otherContext->consumeJobFromOwnQueue())`
			`{`
			`return job;`
			`}`
			`}`
			`return NULL;`
			`}`
			`};`

			`static void WorkerThreadFunc(void* userPtr)`
			`{`
			`BT_PROFILE("WorkerThreadFunc");`
			`ThreadLocalStorage* localStorage = (ThreadLocalStorage*)userPtr;`
			`JobQueue* jobQueue = localStorage->m_queue;`

			`bool shouldSleep = false;`
			`int threadId = localStorage->m_threadId;`
			`while (!shouldSleep)`
			`{`
			`// do work`
			`localStorage->m_mutex.lock();`
			`while (IJob* job = jobQueue->consumeJob())`
			`{`
			`localStorage->m_status = WorkerThreadStatus::kWorking;`
			`job->executeJob(threadId);`
			`localStorage->m_numJobsFinished++;`
			`}`
			`localStorage->m_status = WorkerThreadStatus::kWaitingForWork;`
			`localStorage->m_mutex.unlock();`
			`btU64 clockStart = localStorage->m_clock->getTimeMicroseconds();`
			`// while queue is empty,`
			`while (jobQueue->isQueueEmpty())`
			`{`
			`// todo: spin wait a bit to avoid hammering the empty queue`
			`btSpinPause();`
			`if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kGoToSleep)`
			`{`
			`shouldSleep = true;`
			`break;`
			`}`
			`// if jobs are incoming,`
			`if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs)`
			`{`
			`clockStart = localStorage->m_clock->getTimeMicroseconds(); // reset clock`
			`}`
			`else`
			`{`
			`for (int i = 0; i < 50; ++i)`
			`{`
			`btSpinPause();`
			`btSpinPause();`
			`btSpinPause();`
			`btSpinPause();`
			`if (localStorage->m_directive->getDirective(threadId) == WorkerThreadDirectives::kScanForJobs || !jobQueue->isQueueEmpty())`
			`{`
			`break;`
			`}`
			`}`
			`// if no jobs incoming and queue has been empty for the cooldown time, sleep`
			`btU64 timeElapsed = localStorage->m_clock->getTimeMicroseconds() - clockStart;`
			`if (timeElapsed > localStorage->m_cooldownTime)`
			`{`
			`shouldSleep = true;`
			`break;`
			`}`
			`}`
			`}`
			`}`
			`{`
			`BT_PROFILE("sleep");`
			`// go sleep`
			`localStorage->m_mutex.lock();`
			`localStorage->m_status = WorkerThreadStatus::kSleeping;`
			`localStorage->m_mutex.unlock();`
			`}`
			`}`

			`class btTaskSchedulerDefault : public btITaskScheduler`
			`{`
			`btThreadSupportInterface* m_threadSupport;`
			`WorkerThreadDirectives* m_workerDirective;`
			`btAlignedObjectArray<JobQueue> m_jobQueues;`
			`btAlignedObjectArray<JobQueue*> m_perThreadJobQueues;`
			`btAlignedObjectArray<ThreadLocalStorage> m_threadLocalStorage;`
			`btSpinMutex m_antiNestingLock; // prevent nested parallel-for`
			`btClock m_clock;`
			`int m_numThreads;`
			`int m_numWorkerThreads;`
			`int m_numActiveJobQueues;`
			`int m_maxNumThreads;`
			`int m_numJobs;`
			`static const int kFirstWorkerThreadId = 1;`

			`public:`
			`btTaskSchedulerDefault() : btITaskScheduler("ThreadSupport")`
			`{`
			`m_threadSupport = NULL;`
			`m_workerDirective = NULL;`
			`}`

			`virtual ~btTaskSchedulerDefault()`
			`{`
			`waitForWorkersToSleep();`

			`for (int i = 0; i < m_jobQueues.size(); ++i)`
			`{`
			`m_jobQueues[i].exit();`
			`}`

			`if (m_threadSupport)`
			`{`
			`delete m_threadSupport;`
			`m_threadSupport = NULL;`
			`}`
			`if (m_workerDirective)`
			`{`
			`btAlignedFree(m_workerDirective);`
			`m_workerDirective = NULL;`
			`}`
			`}`

			`void init()`
			`{`
			`btThreadSupportInterface::ConstructionInfo constructionInfo("TaskScheduler", WorkerThreadFunc);`
			`m_threadSupport = btThreadSupportInterface::create(constructionInfo);`
			`m_workerDirective = static_cast<WorkerThreadDirectives>(btAlignedAlloc(sizeof(m_workerDirective), 64));`

			`m_numWorkerThreads = m_threadSupport->getNumWorkerThreads();`
			`m_maxNumThreads = m_threadSupport->getNumWorkerThreads() + 1;`
			`m_numThreads = m_maxNumThreads;`
			`// ideal to have one job queue for each physical processor (except for the main thread which needs no queue)`
			`int numThreadsPerQueue = m_threadSupport->getLogicalToPhysicalCoreRatio();`
			`int numJobQueues = (numThreadsPerQueue == 1) ? (m_maxNumThreads - 1) : (m_maxNumThreads / numThreadsPerQueue);`
			`m_jobQueues.resize(numJobQueues);`
			`m_numActiveJobQueues = numJobQueues;`
			`for (int i = 0; i < m_jobQueues.size(); ++i)`
			`{`
			`m_jobQueues[i].init(m_threadSupport, &m_jobQueues);`
			`}`
			`m_perThreadJobQueues.resize(m_numThreads);`
			`for (int i = 0; i < m_numThreads; i++)`
			`{`
			`JobQueue* jq = NULL;`
			`// only worker threads get a job queue`
			`if (i > 0)`
			`{`
			`if (numThreadsPerQueue == 1)`
			`{`
			`// one queue per worker thread`
			`jq = &m_jobQueues[i - kFirstWorkerThreadId];`
			`}`
			`else`
			`{`
			`// 2 threads share each queue`
			`jq = &m_jobQueues[i / numThreadsPerQueue];`
			`}`
			`}`
			`m_perThreadJobQueues[i] = jq;`
			`}`
			`m_threadLocalStorage.resize(m_numThreads);`
			`for (int i = 0; i < m_numThreads; i++)`
			`{`
			`ThreadLocalStorage& storage = m_threadLocalStorage[i];`
			`storage.m_threadId = i;`
			`storage.m_directive = m_workerDirective;`
			`storage.m_status = WorkerThreadStatus::kSleeping;`
			`storage.m_cooldownTime = 100; // 100 microseconds, threads go to sleep after this long if they have nothing to do`
			`storage.m_clock = &m_clock;`
			`storage.m_queue = m_perThreadJobQueues[i];`
			`}`
			`setWorkerDirectives(WorkerThreadDirectives::kGoToSleep); // no work for them yet`
			`setNumThreads(m_threadSupport->getCacheFriendlyNumThreads());`
			`}`

			`void setWorkerDirectives(WorkerThreadDirectives::Type dir)`
			`{`
			`m_workerDirective->setDirectiveByRange(kFirstWorkerThreadId, m_numThreads, dir);`
			`}`

			`virtual int getMaxNumThreads() const BT_OVERRIDE`
			`{`
			`return m_maxNumThreads;`
			`}`

			`virtual int getNumThreads() const BT_OVERRIDE`
			`{`
			`return m_numThreads;`
			`}`

			`virtual void setNumThreads(int numThreads) BT_OVERRIDE`
			`{`
			`m_numThreads = btMax(btMin(numThreads, int(m_maxNumThreads)), 1);`
			`m_numWorkerThreads = m_numThreads - 1;`
			`m_numActiveJobQueues = 0;`
			`// if there is at least 1 worker,`
			`if (m_numWorkerThreads > 0)`
			`{`
			`// re-setup job stealing between queues to avoid attempting to steal from an inactive job queue`
			`JobQueue* lastActiveContext = m_perThreadJobQueues[m_numThreads - 1];`
			`int iLastActiveContext = lastActiveContext - &m_jobQueues[0];`
			`m_numActiveJobQueues = iLastActiveContext + 1;`
			`for (int i = 0; i < m_jobQueues.size(); ++i)`
			`{`
			`m_jobQueues[i].setupJobStealing(&m_jobQueues, m_numActiveJobQueues);`
			`}`
			`}`
			`m_workerDirective->setDirectiveByRange(m_numThreads, BT_MAX_THREAD_COUNT, WorkerThreadDirectives::kGoToSleep);`
			`}`

			`void waitJobs()`
			`{`
			`BT_PROFILE("waitJobs");`
			`// have the main thread work until the job queues are empty`
			`int numMainThreadJobsFinished = 0;`
			`for (int i = 0; i < m_numActiveJobQueues; ++i)`
			`{`
			`while (IJob* job = m_jobQueues[i].consumeJob())`
			`{`
			`job->executeJob(0);`
			`numMainThreadJobsFinished++;`
			`}`
			`}`

			`// done with jobs for now, tell workers to rest (but not sleep)`
			`setWorkerDirectives(WorkerThreadDirectives::kStayAwakeButIdle);`

			`btU64 clockStart = m_clock.getTimeMicroseconds();`
			`// wait for workers to finish any jobs in progress`
			`while (true)`
			`{`
			`int numWorkerJobsFinished = 0;`
			`for (int iThread = kFirstWorkerThreadId; iThread < m_numThreads; ++iThread)`
			`{`
			`ThreadLocalStorage* storage = &m_threadLocalStorage[iThread];`
			`storage->m_mutex.lock();`
			`numWorkerJobsFinished += storage->m_numJobsFinished;`
			`storage->m_mutex.unlock();`
			`}`
			`if (numWorkerJobsFinished + numMainThreadJobsFinished == m_numJobs)`
			`{`
			`break;`
			`}`
			`btU64 timeElapsed = m_clock.getTimeMicroseconds() - clockStart;`
			`btAssert(timeElapsed < 1000);`
			`if (timeElapsed > 100000)`
			`{`
			`break;`
			`}`
			`btSpinPause();`
			`}`
			`}`

			`void wakeWorkers(int numWorkersToWake)`
			`{`
			`BT_PROFILE("wakeWorkers");`
			`btAssert(m_workerDirective->getDirective(1) == WorkerThreadDirectives::kScanForJobs);`
			`int numDesiredWorkers = btMin(numWorkersToWake, m_numWorkerThreads);`
			`int numActiveWorkers = 0;`
			`for (int iWorker = 0; iWorker < m_numWorkerThreads; ++iWorker)`
			`{`
			`// note this count of active workers is not necessarily totally reliable, because a worker thread could be`
			`// just about to put itself to sleep. So we may on occasion fail to wake up all the workers. It should be rare.`
			`ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];`
			`if (storage.m_status != WorkerThreadStatus::kSleeping)`
			`{`
			`numActiveWorkers++;`
			`}`
			`}`
			`for (int iWorker = 0; iWorker < m_numWorkerThreads && numActiveWorkers < numDesiredWorkers; ++iWorker)`
			`{`
			`ThreadLocalStorage& storage = m_threadLocalStorage[kFirstWorkerThreadId + iWorker];`
			`if (storage.m_status == WorkerThreadStatus::kSleeping)`
			`{`
			`m_threadSupport->runTask(iWorker, &storage);`
			`numActiveWorkers++;`
			`}`
			`}`
			`}`

			`void waitForWorkersToSleep()`
			`{`
			`BT_PROFILE("waitForWorkersToSleep");`
			`setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);`
			`m_threadSupport->waitForAllTasks();`
			`for (int i = kFirstWorkerThreadId; i < m_numThreads; i++)`
			`{`
			`ThreadLocalStorage& storage = m_threadLocalStorage[i];`
			`btAssert(storage.m_status == WorkerThreadStatus::kSleeping);`
			`}`
			`}`

			`virtual void sleepWorkerThreadsHint() BT_OVERRIDE`
			`{`
			`BT_PROFILE("sleepWorkerThreadsHint");`
			`// hint the task scheduler that we may not be using these threads for a little while`
			`setWorkerDirectives(WorkerThreadDirectives::kGoToSleep);`
			`}`

			`void prepareWorkerThreads()`
			`{`
			`for (int i = kFirstWorkerThreadId; i < m_numThreads; ++i)`
			`{`
			`ThreadLocalStorage& storage = m_threadLocalStorage[i];`
			`storage.m_mutex.lock();`
			`storage.m_numJobsFinished = 0;`
			`storage.m_mutex.unlock();`
			`}`
			`setWorkerDirectives(WorkerThreadDirectives::kScanForJobs);`
			`}`

			`virtual void parallelFor(int iBegin, int iEnd, int grainSize, const btIParallelForBody& body) BT_OVERRIDE`
			`{`
			`BT_PROFILE("parallelFor_ThreadSupport");`
			`btAssert(iEnd >= iBegin);`
			`btAssert(grainSize >= 1);`
			`int iterationCount = iEnd - iBegin;`
			`if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())`
			`{`
			`typedef ParallelForJob JobType;`
			`int jobCount = (iterationCount + grainSize - 1) / grainSize;`
			`m_numJobs = jobCount;`
			`btAssert(jobCount >= 2); // need more than one job for multithreading`
			`int jobSize = sizeof(JobType);`

			`for (int i = 0; i < m_numActiveJobQueues; ++i)`
			`{`
			`m_jobQueues[i].clearQueue(jobCount, jobSize);`
			`}`
			`// prepare worker threads for incoming work`
			`prepareWorkerThreads();`
			`// submit all of the jobs`
			`int iJob = 0;`
			`int iThread = kFirstWorkerThreadId; // first worker thread`
			`for (int i = iBegin; i < iEnd; i += grainSize)`
			`{`
			`btAssert(iJob < jobCount);`
			`int iE = btMin(i + grainSize, iEnd);`
			`JobQueue* jq = m_perThreadJobQueues[iThread];`
			`btAssert(jq);`
			`btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);`
			`void* jobMem = jq->allocJobMem(jobSize);`
			`JobType* job = new (jobMem) ParallelForJob(i, iE, body); // placement new`
			`jq->submitJob(job);`
			`iJob++;`
			`iThread++;`
			`if (iThread >= m_numThreads)`
			`{`
			`iThread = kFirstWorkerThreadId; // first worker thread`
			`}`
			`}`
			`wakeWorkers(jobCount - 1);`

			`// put the main thread to work on emptying the job queue and then wait for all workers to finish`
			`waitJobs();`
			`m_antiNestingLock.unlock();`
			`}`
			`else`
			`{`
			`BT_PROFILE("parallelFor_mainThread");`
			`// just run on main thread`
			`body.forLoop(iBegin, iEnd);`
			`}`
			`}`
			`virtual btScalar parallelSum(int iBegin, int iEnd, int grainSize, const btIParallelSumBody& body) BT_OVERRIDE`
			`{`
			`BT_PROFILE("parallelSum_ThreadSupport");`
			`btAssert(iEnd >= iBegin);`
			`btAssert(grainSize >= 1);`
			`int iterationCount = iEnd - iBegin;`
			`if (iterationCount > grainSize && m_numWorkerThreads > 0 && m_antiNestingLock.tryLock())`
			`{`
			`typedef ParallelSumJob JobType;`
			`int jobCount = (iterationCount + grainSize - 1) / grainSize;`
			`m_numJobs = jobCount;`
			`btAssert(jobCount >= 2); // need more than one job for multithreading`
			`int jobSize = sizeof(JobType);`
			`for (int i = 0; i < m_numActiveJobQueues; ++i)`
			`{`
			`m_jobQueues[i].clearQueue(jobCount, jobSize);`
			`}`

			`// initialize summation`
			`for (int iThread = 0; iThread < m_numThreads; ++iThread)`
			`{`
			`m_threadLocalStorage[iThread].m_sumResult = btScalar(0);`
			`}`

			`// prepare worker threads for incoming work`
			`prepareWorkerThreads();`
			`// submit all of the jobs`
			`int iJob = 0;`
			`int iThread = kFirstWorkerThreadId; // first worker thread`
			`for (int i = iBegin; i < iEnd; i += grainSize)`
			`{`
			`btAssert(iJob < jobCount);`
			`int iE = btMin(i + grainSize, iEnd);`
			`JobQueue* jq = m_perThreadJobQueues[iThread];`
			`btAssert(jq);`
			`btAssert((jq - &m_jobQueues[0]) < m_numActiveJobQueues);`
			`void* jobMem = jq->allocJobMem(jobSize);`
			`JobType* job = new (jobMem) ParallelSumJob(i, iE, body, &m_threadLocalStorage[0]); // placement new`
			`jq->submitJob(job);`
			`iJob++;`
			`iThread++;`
			`if (iThread >= m_numThreads)`
			`{`
			`iThread = kFirstWorkerThreadId; // first worker thread`
			`}`
			`}`
			`wakeWorkers(jobCount - 1);`

			`// put the main thread to work on emptying the job queue and then wait for all workers to finish`
			`waitJobs();`

			`// add up all the thread sums`
			`btScalar sum = btScalar(0);`
			`for (int iThread = 0; iThread < m_numThreads; ++iThread)`
			`{`
			`sum += m_threadLocalStorage[iThread].m_sumResult;`
			`}`
			`m_antiNestingLock.unlock();`
			`return sum;`
			`}`
			`else`
			`{`
			`BT_PROFILE("parallelSum_mainThread");`
			`// just run on main thread`
			`return body.sumLoop(iBegin, iEnd);`
			`}`
			`}`
			`};`

			// Builds the default multithreaded task scheduler: allocates a
			// btTaskSchedulerDefault, runs its init() and returns it through the
			// btITaskScheduler interface.
			btITaskScheduler* btCreateDefaultTaskScheduler()
			{
				btTaskSchedulerDefault* scheduler = new btTaskSchedulerDefault();
				scheduler->init();
				btITaskScheduler* asInterface = scheduler;
				return asInterface;
			}

			`#else // #if BT_THREADSAFE`

			// Variant compiled when BT_THREADSAFE is off: there is no default
			// scheduler, so callers receive NULL.
			btITaskScheduler* btCreateDefaultTaskScheduler()
			{
				btITaskScheduler* noScheduler = NULL;
				return noScheduler;
			}

			`#endif // #else // #if BT_THREADSAFE`