/*  abddiffmmx, by tboult@eecs.lehigh.edu 1998-1999.   
    Feel free to use/abuse this.  
    If you learn something from it, I have received enough reward.

    This is a simple, yet realistic,  example of using MMX under gnu tools
    (e.g. for linux, freebsd, netbsd or even <shuddering> windog  )
    It is based on some of the code we do in the Lehigh Omni-directional
    Tracking system (LOTS) system (see www.eecs.lehigh.edu/~tboult/TRACK)

    The routine computes two outputs.  The first, the output image, computes
    if a the absolute value of the difference of the pixel from a background
    reference is above a per-pixel threshold.  If it is we store the absdiff,
    else we set it to zero.  Logically it looks liken
    if ( abs( inim[i] - refim[i]) <= (thresh + varim[i])  )   
        outim[i] = 0
    else outim[i] = abs( inim[i] - refim[i]) - (thresh + varim[i])  

    We also compute a per-row flag, such that if 
    row_used[i] != 0, then we know that at least one pixel in the row is
    "above" the threshold.

    In our real system the code is more complex because we handle different 
    data formats, e.g. yuv422packed,  because we only process  "cords"  in 
    each row, and because we compute a "start and end" point for the above
    threshold pixels.    (The latter significantly reduces the cost of
    additional processing.)  But these make the code much harder to follow

    
*/

void absdiffmmx( // the output's  
                 unsigned char* outim, // where we store output
		 int*        row_used, // a per-row flag if "used"
		 // inputs
		 unsigned char* inim,  // the input image
		 unsigned char* refim, // the reference image (input)
		 unsigned char* varim, // the per pixel threshold (variance) 
		 unsigned char thresh, // the global threshold
		 int rows, int cols) // obvious
{

  // example of image difference and thresholding for a BW image.
  // the images must be 8-byte aligned or performance will suffer badly
  // see comments for examples on aligning things. 
  



  // the thresholding combines a per-pixel threshold (varim) and the 
  // global  threshold (thresh)  We copy the global into an mmx register 
  // so we can easily add it in the loop
  
  double mmtemp_d;  // gets it double aligned and allocates storage
  unsigned char* mmtemp= (unsigned char*) & mmtemp_d; //ease of access

  // setup,the global threshold in a mmx register
  for (int i=0;i<8;i++) {
    mmtemp[i] = thresh;
  }


  // load the threshold in mm7.. example macro for loading just 1 register
  // note we use newline and tab so we get nice looking assembly...
#define mmx_load8byte_mm7(data)\
  __asm__ volatile("\n\tmovq %0,%%mm7\n":   "=m" (data):)
  mmx_load8byte_mm7(mmtemp_d);

  for (int i=0;i<rows;i++) 
    {
      // Make them double pointers so they index easily 
      double *ipt = (double*) &inim[i*cols];
      double *opt = (double*) &outim[i*cols];
      double *rpt = (double*) &refim [i*cols];
      double *vpt = (double*) &varim[i*cols];
      
      // Remember GNU syntax is the opposite of Microsoft..  it
      // reads    op  source, dest
      // note that the lines include "formating and comments" so the 
      // generated assembly is easier to read
      // in mm0 is the ref image,
      // in mm1 is the input 
      // in mm2 is a copy of reference (so we can do i-r and r-i)
      // in mm3 is the row used "flag"
      // in mm4 is the per-pixel threshold (variance)
      // in mm7 is the global threshold (8 copies)
      // we compute abs of i-r by computing i-r and r-i using 
      // saturation (so one of them 0) and then ORing the results 
      // note I choose not to use low-level registers in my code
      // I let the g++ optimizer choose what variables to store in what
      // registers (I control the mmx only).. 
      // note that the comments include expected pipe position on a regular 
      // mmx.. it might be different on a K6 or PII where the underlying/ 
      // hardware is a tad different.  (definitely different on K6/K7 as
      // their MMX pipes are more flexible.)  Note that two consecutive lines
      // with u means different "cycles", but a u/v pair are issued at the
      // same time.   We could hand optimize for a better pipe usage for
      // a particular arch and gain somewhere between 0-7% 

      // We could include loop-unrolling and gain a little3-5%
      // and then I would do the index registers myself..
      // and would better use the "v" pipes, starting the second set of 
      // "loads" at the instruction where we get abs diff via por.


      // mm3 is used for a flag tracking if any pixels in the row are above 
      // threshold.  not really needed but it can make further  processing
      // faster by skipping empty rows.
      __asm__ volatile("\n\tpxor %%mm3, %%mm3 \t # clear row-used flag"   : : );
      int k = cols/8; // we will be using 8 chars at a time.
      register int index=0; 

      while(index<k){ // run across the row
        __asm__ volatile
                ( // instruction             #pipe    comment                  
                "\n\t movq %1,%%mm0       \t# u  load ref  "
                "\n\t movq %2,%%mm1       \t# u  load input "
                "\n\t movq %%mm0,%%mm2    \t# v copy ref "
                "\n\t movq %3,%%mm4       \t# u  load var image data "
                "\n\t psubusb %%mm1, %%mm0\t# v  ref - inp "
                "\n\t psubusb %%mm2, %%mm1\t# u inp - ref  ?subtract stall? "
                "\n\t paddusb %%mm7,%%mm4 \t# v add base threshold to variance"
                "\n\t por %%mm1, %%mm0    \t# u get abs diff (via or) "
                "\n\t psubusb %%mm4,%%mm0 \t# u subtract with saturate thresh"
                "\n\t movq %%mm0,%0       \t# u store result, "
                "\n\t por %%mm0, %%mm3    \t# v mark row used if it was "
		: "=m" (opt[index])  // this is %0,  output
		: "m" (rpt[index]),     // and %1   reference 
		  "m" (ipt[index]),     // and %2   input
		  "m" (vpt[index])      // and %3   per-pixel-threshold
		// the "=m" implies it is output.. just "m" is input
		// see the gcc info pages for more details...
		// including clobber list is not supported,  yet 
		// : "%%mm0","%%mm1","%%mm2","%%mm3","%%mm4","%%mm7"
		);  
		index++ ;
      }
      // get the per row flag back into an 
      __asm__ volatile("\n\tmovq %%mm3,%0" : "=m" (mmtemp_d));
      
      row_used [i]=0; // could be faster by playing int games, but why bother
      for (int ii=0;ii<8;ii++) 	row_used [i] |= mmtemp[ii];
    }

  // ok, done with mmx,reset it to normal floating point usage
   __asm__("emms" : : );

}


/*
  some gnu/mmx hints.  
  First of all some of the compilers/binutils may have problems. Test with
  the above which should compile with no problems.  binutils 2.9 or higher is
  recommended.     

  Note that egcs-2.91.66 (and a few other versions do not properly handle MMX
  in the assembly pass-through.   It should replace the %%mm5 with %mm5 as it
  processes but it   does not.    (If you get error messages about "bad
  register name ('%%mm3')"  or something like it, you know you have the broken
  version). 
 The workaround is to compile to assembly (e.g. 
         g++ -O3 -S -o imgdiff-tmp.s imgdiff.cc 
  then  use an editor or sed to replace %%mm with %mm
  (e.g. 
         sed -e 's/%%mm/%mm/g' < imdiff-tmp.s >imgdiff.s
         g++ -O3 -o imgdiff.o imgdiff.s 
  )

  Note that some level of optimization may be needed (without it you may get
  error messages such as
  "fixed or forbidden register 7 (sp) was spilled for class GENERAL_REGS."


  Also note the above shows how to generate assembly so you can "check" the
  quality of your code and decide if there is more to do for optimization.
  Note the asm code passed through will show up in the  .s file between sets
  with something like.  You can also add assembly tags (.labelname)  for
  profiling and to make it easier to find your code.
#APP
	pxor %%mm3, %%mm3 	 ; clear row-used flag
#NO_APP


               *********** alignment ****************
 As mentioned,  getting alignment of data is important.  For MMX and PII, only
 double alignment is needed.  To achieve this I do something like
  
  unsigned char* real_inimg=malloc(spaceneeded + 32);
  inimg = (unsigned char *) ((unsigned long)(real_inimg + 8) & 
				(unsigned long) 0xFFFFFFF8);

  when you "free" remember to freee real_inimg, not inimg.
 

*/

	
