Blob Blame History Raw
///////////////////////////////////////////////////////////////////////////
//
// Copyright (c) 2009-2014 DreamWorks Animation LLC. 
//
// All rights reserved.
//
// Redistribution and use in source and binary forms, with or without
// modification, are permitted provided that the following conditions are
// met:
// *       Redistributions of source code must retain the above copyright
// notice, this list of conditions and the following disclaimer.
// *       Redistributions in binary form must reproduce the above
// copyright notice, this list of conditions and the following disclaimer
// in the documentation and/or other materials provided with the
// distribution.
// *       Neither the name of DreamWorks Animation nor the names of
// its contributors may be used to endorse or promote products derived
// from this software without specific prior written permission.
//
// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
//
///////////////////////////////////////////////////////////////////////////

//
// A program to generate various acceleration lookup tables 
// for Imf::DwaCompressor
//

#include <cstddef>
#include <stdio.h>
#include <stdlib.h>
#include <math.h>
#include <vector>

#include <OpenEXRConfig.h>

#ifdef OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN
#include <unistd.h>
#endif

#include <half.h>
#include <IlmThread.h>
#include <IlmThreadSemaphore.h>
#include <ImfIO.h>
#include <ImfXdr.h>
#include "ImfNamespace.h"

using namespace OPENEXR_IMF_NAMESPACE;

namespace {

    class LutHeaderWorker
    {
        public:
            class Runner : public IlmThread::Thread
            {
                public:
                    Runner(LutHeaderWorker &worker, bool output):
                        IlmThread::Thread(),
                        _worker(worker),
                        _output(output)
                    {
                        start();
                    }

                    virtual ~Runner()
                    {
                        _semaphore.wait();
                    }

                    virtual void run()
                    {
                        _semaphore.post();
                        _worker.run(_output);
                    }

                private:
                    LutHeaderWorker     &_worker;
                    bool                 _output;
                    IlmThread::Semaphore _semaphore;

            }; // class LutHeaderWorker::Runner


            LutHeaderWorker(size_t startValue,
                            size_t endValue):
                _lastCandidateCount(0),
                _startValue(startValue),
                _endValue(endValue),
                _numElements(0),
                _offset(new size_t[numValues()]),
                _elements(new unsigned short[1024*1024*2])
            {
            }

            ~LutHeaderWorker()
            {
                delete[] _offset;
                delete[] _elements;
            }

            size_t lastCandidateCount() const
            {
                return _lastCandidateCount;
            }

            size_t numValues() const 
            {
                return _endValue - _startValue;
            }

            size_t numElements() const
            {
                return _numElements;
            }

            const size_t* offset() const
            {
                return _offset;
            }

            const unsigned short* elements() const
            {
                return _elements;
            }

            void run(bool outputProgress)
            {
                half candidate[16];
                int  candidateCount = 0;

                for (size_t input=_startValue; input<_endValue; ++input) {

                    if (outputProgress) {
#ifdef __GNUC__
                        if (input % 100 == 0) {
                            fprintf(stderr, 
                            " Building acceleration for DwaCompressor, %.2f %%      %c",
                                          100.*(float)input/(float)numValues(), 13);
                        }
#else
                        if (input % 1000 == 0) {
                            fprintf(stderr, 
                            " Building acceleration for DwaCompressor, %.2f %%\n",
                                          100.*(float)input/(float)numValues());
                        }
#endif
                    } 

                    
                    int  numSetBits = countSetBits(input);
                    half inputHalf, closestHalf;

                    inputHalf.setBits(input);

                    _offset[input - _startValue] = _numElements;

                    // Gather candidates
                    candidateCount = 0;
                    for (int targetNumSetBits=numSetBits-1; targetNumSetBits>=0;
                                                           --targetNumSetBits) {
                        bool valueFound = false;

                        for (int i=0; i<65536; ++i) {
                            if (countSetBits(i) != targetNumSetBits) continue;

                            if (!valueFound) {
                                closestHalf.setBits(i);
                                valueFound = true;
                            } else {
                                half tmpHalf;

                                tmpHalf.setBits(i);

                                if (fabs((float)inputHalf - (float)tmpHalf) < 
                                    fabs((float)inputHalf - (float)closestHalf)) {
                                    closestHalf = tmpHalf;
                                }
                            }
                        }

                        if (valueFound == false) {
                            fprintf(stderr, "bork bork bork!\n");
                        }       

                        candidate[candidateCount] = closestHalf;
                        candidateCount++;
                    }

                    // Sort candidates by increasing number of bits set
                    for (int i=0; i<candidateCount; ++i) {
                        for (int j=i+1; j<candidateCount; ++j) {

                            int   iCnt = countSetBits(candidate[i].bits());
                            int   jCnt = countSetBits(candidate[j].bits());

                            if (jCnt < iCnt) {
                                half tmp     = candidate[i];
                                candidate[i] = candidate[j];
                                candidate[j] = tmp;
                            }
                        }
                    }

                    // Copy candidates to the data buffer;
                    for (int i=0; i<candidateCount; ++i) {
                        _elements[_numElements] = candidate[i].bits();
                        _numElements++;
                    }

                    if (input == _endValue-1) {
                        _lastCandidateCount = candidateCount;
                    }
                }
            }
            

        private:
            size_t          _lastCandidateCount;
            size_t          _startValue;
            size_t          _endValue;
            size_t          _numElements;
            size_t         *_offset;
            unsigned short *_elements;

            //
            // Precomputing the bit count runs faster than using
            // the builtin instruction, at least in one case..
            //
            // Precomputing 8-bits is no slower than 16-bits,
            // and saves a fair bit of overhead..
            //
            int countSetBits(unsigned short src)
            {
                static const unsigned short numBitsSet[256] =
                {
                    0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4,
                    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
                    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
                    1, 2, 2, 3, 2, 3, 3, 4, 2, 3, 3, 4, 3, 4, 4, 5,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
                    2, 3, 3, 4, 3, 4, 4, 5, 3, 4, 4, 5, 4, 5, 5, 6,
                    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
                    3, 4, 4, 5, 4, 5, 5, 6, 4, 5, 5, 6, 5, 6, 6, 7,
                    4, 5, 5, 6, 5, 6, 6, 7, 5, 6, 6, 7, 6, 7, 7, 8
                };

                return numBitsSet[src & 0xff] + numBitsSet[src >> 8];
            }

    }; // class LutHeaderWorker

} // namespace


//
// Generate a no-op LUT, to cut down in conditional branches
//
void
generateNoop()
{
    printf("const unsigned short dwaCompressorNoOp[] = \n");
    printf("{");
    for (int i=0; i<65536; ++i) {

        if (i % 8 == 0) {
            printf("\n    ");
        }

        unsigned short dst;
        char *tmp = (char *)(&dst);

        unsigned short src = (unsigned short)i;
        Xdr::write <CharPtrIO> (tmp,  src);

        printf("0x%04x, ", dst);
    }
    printf("\n};\n");
}

//
// Nonlinearly encode luminance. For values below 1.0, we want
// to use a gamma 2.2 function to match what is fairly common
// for storing output referred. However, > 1, gamma functions blow up,
// and log functions are much better behaved. We could use a log 
// function everywhere, but it tends to over-sample dark 
// regions and undersample the brighter regions, when 
// compared to the way real devices reproduce values.
//
// So, above 1, use a log function which is a smooth blend
// into the gamma function. 
//
//  Nonlinear(linear) = 
//
//    linear^(1./2.2)             / linear <= 1.0
//                               |
//    ln(linear)/ln(e^2.2) + 1    \ otherwise
//
//
// toNonlinear[] needs to take in XDR format half float values,
// and output NATIVE format float. 
//
// toLinear[] does the opposite - takes in NATIVE half and 
// outputs XDR half values.
//

void
generateToLinear()
{
    unsigned short toLinear[65536];

    toLinear[0] = 0;

    for (int i=1; i<65536; ++i) {
        half  h;
        float sign    = 1;
        float logBase = pow(2.7182818, 2.2);

        // map  NaN and inf to 0
        if ((i & 0x7c00) == 0x7c00) {
            toLinear[i]    = 0;
            continue;
        }

        //
        // _toLinear - assume i is NATIVE, but our output needs
        //             to get flipped to XDR
        //
        h.setBits(i);
        sign = 1;
        if ((float)h < 0) {
            sign = -1;
        } 

        if ( fabs( (float)h) <= 1.0 ) {
            h  = (half)(sign * pow((float)fabs((float)h), 2.2f));
        } else {
            h  = (half)(sign * pow(logBase, (float)(fabs((float)h) - 1.0)));
        }

        {
            char *tmp = (char *)(&toLinear[i]);

            Xdr::write <CharPtrIO> ( tmp,  h.bits());
        }
    }
    
    printf("const unsigned short dwaCompressorToLinear[] = \n");
    printf("{");
    for (int i=0; i<65536; ++i) {
        if (i % 8 == 0) {
            printf("\n    ");
        }
        printf("0x%04x, ", toLinear[i]);
    }
    printf("\n};\n");
}


void
generateToNonlinear()
{
    unsigned short toNonlinear[65536];

    toNonlinear[0] = 0;

    for (int i=1; i<65536; ++i) {
        unsigned short usNative, usXdr;
        half  h;
        float sign    = 1;
        float logBase = pow(2.7182818, 2.2);

        usXdr           = i;

        {
            const char *tmp = (char *)(&usXdr);

            Xdr::read<CharPtrIO>(tmp, usNative);
        }

        // map  NaN and inf to 0
        if ((usNative & 0x7c00) == 0x7c00) {
            toNonlinear[i] = 0;
            continue;
        }

        //
        // toNonlinear - assume i is XDR
        //
        h.setBits(usNative);
        sign = 1;
        if ((float)h < 0) {
            sign = -1;
        } 

        if ( fabs( (float)h ) <= 1.0) {
            h = (half)(sign * pow(fabs((float)h), 1.f/2.2f));
        } else {
            h = (half)(sign * ( log(fabs((float)h)) / log(logBase) + 1.0) );
        }
        toNonlinear[i] = h.bits();
    }

    printf("const unsigned short dwaCompressorToNonlinear[] = \n");
    printf("{");
    for (int i=0; i<65536; ++i) {
        if (i % 8 == 0) {
            printf("\n    ");
        }
        printf("0x%04x, ", toNonlinear[i]);
    }
    printf("\n};\n");
}

//
// Attempt to get available CPUs in a somewhat portable way. 
//

int
cpuCount()
{
    if (!IlmThread::supportsThreads()) return 1;

    int cpuCount = 1;

#if defined (OPENEXR_IMF_HAVE_SYSCONF_NPROCESSORS_ONLN)

    cpuCount = sysconf(_SC_NPROCESSORS_ONLN);

#elif defined (_WIN32)

    SYSTEM_INFO sysinfo;
    GetSystemInfo( &sysinfo );
    cpuCount = sysinfo.dwNumberOfProcessors;

#endif

    if (cpuCount < 1) cpuCount = 1;
    return cpuCount;
}

//
// Generate acceleration luts for the quantization.
//
// For each possible input value, we want to find the closest numbers
// which have one fewer bits set than before. 
//
// This gives us num_bits(input)-1 values per input. If we alloc
// space for everything, that's like a 2MB table. We can do better
// by compressing all the values to be contigious and using offset
// pointers.
//
// After we've found the candidates with fewer bits set, sort them
// based on increasing numbers of bits set. This way, on quantize(),
// we can scan through the list and halt once we find the first
// candidate within the error range. For small values that can 
// be quantized to 0, 0 is the first value tested and the search
// can exit fairly quickly.
//

void
generateLutHeader()
{
    std::vector<LutHeaderWorker*> workers;

    size_t numWorkers     = cpuCount();
    size_t workerInterval = 65536 / numWorkers;

    for (size_t i=0; i<numWorkers; ++i) {
        if (i != numWorkers-1) {
            workers.push_back( new LutHeaderWorker( i   *workerInterval, 
                                                   (i+1)*workerInterval) );
        } else {
            workers.push_back( new LutHeaderWorker(i*workerInterval, 65536) );
        }
    }

    if (IlmThread::supportsThreads()) {
        std::vector<LutHeaderWorker::Runner*> runners;
        for (size_t i=0; i<workers.size(); ++i) {
            runners.push_back( new LutHeaderWorker::Runner(*workers[i], (i==0)) );
        }

        for (size_t i=0; i<workers.size(); ++i) {
            delete runners[i];
        }
    } else {
        for (size_t i=0; i<workers.size(); ++i) {
            workers[i]->run(i == 0);
        }
    }

    printf("static unsigned int closestDataOffset[] = {\n");
    int offsetIdx  = 0;
    int offsetPrev = 0;
    for (size_t i=0; i<workers.size(); ++i) {
        for (size_t value=0; value<workers[i]->numValues(); ++value) {
            if (offsetIdx % 8 == 0) {
                printf("    ");
            }
            printf("%6lu, ", workers[i]->offset()[value] + offsetPrev);
            if (offsetIdx % 8 == 7) {
                printf("\n");
            }
            offsetIdx++;
        }
        offsetPrev += workers[i]->offset()[workers[i]->numValues()-1] + 
                      workers[i]->lastCandidateCount();
    }
    printf("};\n\n\n");


    printf("static unsigned short closestData[] = {\n");
    int elementIdx = 0;
    for (size_t i=0; i<workers.size(); ++i) {
        for (size_t element=0; element<workers[i]->numElements(); ++element) {
            if (elementIdx % 8 == 0) {
                printf("    ");
            }
            printf("%5d, ", workers[i]->elements()[element]);
            if (elementIdx % 8 == 7) {
                printf("\n");
            }
            elementIdx++;
        }    
    }
    printf("};\n\n\n");

    for (size_t i=0; i<workers.size(); ++i) {
        delete workers[i];
    }
}


int
main(int argc, char **argv)
{
    printf("#include <cstddef>\n");
    printf("\n\n\n");

    generateNoop();

    printf("\n\n\n");

    generateToLinear();

    printf("\n\n\n");

    generateToNonlinear();

    printf("\n\n\n");

    generateLutHeader();

    return 0;
}