=begin
	$Id: mmx.rb,v 1.1 2005-10-04 02:02:14 matju Exp $

	GridFlow
	Copyright (c) 2001,2002,2003,2004 by Mathieu Bouchard

	This program is free software; you can redistribute it and/or
	modify it under the terms of the GNU General Public License
	as published by the Free Software Foundation; either version 2
	of the License, or (at your option) any later version.

	See file ../COPYING for further informations on licensing terms.

	This program is distributed in the hope that it will be useful,
	but WITHOUT ANY WARRANTY; without even the implied warranty of
	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
	GNU General Public License for more details.

	You should have received a copy of the GNU General Public License
	along with this program; if not, write to the Free Software
	Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
=end

# Output setup.
# ARGV[0] names the assembly file: standard output is redirected to it, so
# every bare `puts` in this script writes assembly text.
# ARGV[1] names the loader source file, kept open in $loader.
STDOUT.reopen(ARGV[0], "w")
$loader = File.open(ARGV[1], "w")

# $count is incremented once per generated function; $lines starts at zero too.
$count = $lines = 0

# First line of each of the two generated files.
puts("; generated by/for GridFlow 0.8.0")
$loader.puts("#include \"../base/grid.h.fcs\"\nextern \"C\" {")

# Helper that prints the scaffolding of one assembly function on standard
# output. An instance remembers the function name and a counter from which
# unique labels of the form "<name>_<number>" are derived.
class AsmFunction
	# name: symbol name of the function; also used as the label prefix.
	def initialize(name)
		@name = name
		@label_count = 1
	end

	# Prints the GLOBAL directive, the entry label and a prologue that saves
	# ebp, esi and edi; yields a new AsmFunction so that the block can print
	# the body; then prints the matching epilogue and "ret".
	def self.make(name)
		puts "", "GLOBAL #{name}", "#{name}:"
		puts "push ebp", "mov ebp,esp", "push esi", "push edi"
		yield AsmFunction.new(name)
		puts "pop edi", "pop esi", "leave", "ret", ""
	end

	# Prints a loop: a start label, then the instructions given in ops, then
	# whatever the block prints, then a jump back to the start label, then an
	# exit label.
	#
	# ops: one or more instruction strings. The last one must be a jump
	# mnemonic without its target (e.g. "jz"); the exit label is appended to
	# it in the output. The strings passed in are left untouched.
	#
	# Raises ArgumentError when called without any instruction.
	def make_until(*ops)
		raise ArgumentError, "make_until needs at least one instruction" if ops.empty?
		top = @label_count
		done = @label_count+1
		@label_count += 2
		# Build a new string for the exit jump instead of appending to the
		# caller's own string, which would alter (or fail on) its argument.
		exit_jump = "#{ops[-1]} #{@name}_#{done}"
		puts "#{@name}_#{top}: ", *(ops[0...-1] << exit_jump)
		yield
		puts "jmp #{@name}_#{top}"
		puts "#{@name}_#{done}:"
	end
end

# Size in bytes of one element of each number type.
$sizeof = {
	:uint8   => 1, :int16   => 2, :int32 => 4, :int64 => 8,
	:float32 => 4, :float64 => 8,
}

# Name of the x86 accumulator register of the matching width.
# Only the types listed here have one.
$accum = { :uint8 => "al", :int16 => "ax", :int32 => "eax" }

# Operand-size keyword written in front of a memory operand.
$asm_type = { :uint8 => "byte", :int16 => "word", :int32 => "dword", :int64 => "qword" }

# Opcode table. Each row is indexed as follows: 0 = GridFlow operator name,
# 1 = plain x86 opcode, 2 and up = packed opcode for each element type, in
# the order given by the header below.
# An entry written "_" means that no such instruction seems to be available.
# Note: >> on x86 ought to be shr in the uint8 case.
# All of the MMX information comes from the NASM manual, Appendix B.
$opcodes = {
#                     [--GF--|--x86--|--mmx-et-al----------------------------------------]
#                     [      |       |-uint8-|-int16-|-int32-|-int64-|-float32-|-float64-]
	:add     => %w[ +      add    paddb   paddw   paddd   paddq                      ],
	:sub     => %w[ -      sub    psubb   psubw   psubd   psubq                      ],
	:and     => %w[ &      and    pand    pand    pand    pand                       ],
	:xor     => %w[ ^      xor    pxor    pxor    pxor    pxor                       ],
	:or      => %w[ |      or     por     por     por     por                        ],
#	:max     => %w[ max    _      pmaxub  pmaxsw  _       _                          ], # not plain MMX !!! (req.Katmai)
#	:min     => %w[ min    _      pminub  pminsw  _       _                          ], # not plain MMX !!! (req.Katmai)
#	:eq      => %w[ ==     _      pcmpeqb pcmpeqw pcmpeqd _                          ],
#	:gt      => %w[ >      _      pcmpgtb pcmpgtw pcmpgtd _                          ],
#	:shl     => %w[ <<     shl    _       psllw   pslld   psllq                      ], # noncommutative
#	:shr     => %w[ >>     sar    _       psraw   psrad   _                          ], # noncommutative
#	:clipadd => %w[ clip+  _      paddusb paddsw  _       _                          ], # future use
#	:clipsub => %w[ clip-  _      psubusb psubsw  _       _                          ], # future use
#	:andnot  => %w[ &not   _      pandn   pandn   pandn   pandn                      ], # not planned
}

# Turn every "_" placeholder into nil, in place, and log each resulting row
# on standard error.
$opcodes.each_value do |row|
	row.map! {|entry| entry == "_" ? nil : entry }
	STDERR.puts row.inspect
end

# Text accumulated by the generators: C declarations of the generated
# functions, and the C statements that install them.
$decls, $install = "", ""

# Prints the assembly of the function "mmx_<type>_map_<op>" and records its
# C declaration (void f(int, T*, T)) in $decls and its installation
# statement in $install.
#
# op:   a key of $opcodes
# type: a key of $sizeof, $accum and $asm_type
#
# Returns nil without printing anything when $opcodes has no packed opcode
# in the column selected for this type; otherwise increments $count and
# returns its new value.
def make_fun_map(op,type)
	fname = "mmx_#{type}_map_#{op}"
	width = $sizeof[type]
	reg = $accum[type]
	row = $opcodes[op]
	gf_sym, x86_op = row[0], row[1]
	# column of the packed opcode: 2 for one-byte, 3 for two-byte elements;
	# for widths of 4 and more the width itself is used as the column.
	mmx_op = row[width + (width<4 ? 1 : 0)]
	return unless mmx_op
	AsmFunction.make(fname) do |fn|
		# the three stack arguments (presumably count, data pointer and
		# scalar operand, going by the C declaration recorded below)
		puts "mov ecx,[ebp+8]", "mov esi,[ebp+12]", "mov eax,[ebp+16]"
		# replicate the scalar: byte to word, then word to dword
		puts "mov dx,ax", "shl eax,8", "mov al,dl" if width==1
		puts "mov edx,eax", "shl eax,16", "mov ax,dx" if width<=2
		# two copies of eax pushed on the stack fill the 8 bytes of mm7
		puts "push eax", "push eax", "movq mm7,[esp]", "add esp,8"
		per_quad = 8/width  # elements held by one 8-byte register
		# first a loop handling 4 quadwords per pass, then 1 quadword per pass
		[4, 1].each do |n|
			fn.make_until("cmp ecx,#{per_quad*n}", "jb near") do
				0.step(n,4) do |k|
					lanes = (0...[n-k,4].min).to_a
					lanes.each {|i| puts "movq mm#{i},[esi+#{8*(i+k)}]" }
					lanes.each {|i| puts "#{mmx_op} mm#{i},mm7" }
					lanes.each {|i| puts "movq [esi+#{8*(i+k)}],mm#{i}" }
				end
				puts "lea esi,[esi+#{8*n}]", "lea ecx,[ecx-#{per_quad*n}]"
			end
		end
		# remaining elements, one at a time, with the plain x86 opcode
		fn.make_until("test ecx,ecx", "jz") do
			puts "#{x86_op} #{$asm_type[type]} [esi],#{reg}",
			     "lea esi,[esi+#{width}]",
			     "dec ecx"
		end
		puts "emms"
	end
	$decls << "void #{fname}(int,#{type}*,#{type});\n"
	$install << "FIX2PTR(Numop,rb_hash_aref(op_dict,SYM(#{gf_sym})))" <<
	            "->on_#{type}.op_map = #{fname};\n"
	$count += 1
end

def make_fun_zip(op,type)
s="mmx_#{type}_zip_#{op}"
	size = $sizeof[type]
	accum = $accum[type]
	sym = $opcodes[op][0]
	opcode = $opcodes[op][1]
	mopcode = $opcodes[op][size+(size<4 ? 1 : 0)]
	return if not mopcode
	AsmFunction.make(s) {|a|
		puts "mov ecx,[ebp+8]",  "mov edi,[ebp+12]",
		     "mov esi,[ebp+16]"#, "mov ebx,[ebp+20]"
		foo = proc {|n|
			a.make_until("cmp ecx,#{8/size*n}","jb near") {
				0.step(n,4) {|k|
				nn=[n-k,4].min
				o=(0..3).map{|x| 8*(x+k) }
				for i in 0...nn do puts "movq mm#{i},[edi+#{o[i]}]" end
				for i in 0...nn do puts "movq mm#{i+4},[esi+#{o[i]}]" end
				for i in 0...nn do puts "#{mopcode} mm#{i},mm#{i+4}" end
				for i in 0...nn do puts "movq [edi+#{o[i]}],mm#{i}" end
				}
				#for i in 0...n do puts "movq [ebx+#{8*i}],mm#{i}" end
				puts "lea edi,[edi+#{8*n}]"
				puts "lea esi,[esi+#{8*n}]"
				#puts "lea ebx,[ebx+#{8*n}]"
				puts "lea ecx,[ecx-#{8/size*n}]"
			}
		}
		foo.call 4
		foo.call 1
		a.make_until("test ecx,ecx", "jz") {
			# requires commutativity ??? fails with shl, shr
			puts "mov #{accum},[esi]"
			puts "#{opcode} #{$asm_type[type]} [edi],#{accum}"
			#puts "mov #{accum},[edi]"
			#puts "#{opcode} #{accum},[esi]"
			#puts "mov [ebx],#{accum}"
			puts "lea edi,[edi+#{size}]"
			puts "lea esi,[esi+#{size}]"
			#puts "lea ebx,[ebx+#{size}]"
			puts "dec ecx"
		}
		puts "emms"
	}
	#$decls << "void #{s}(int,#{type}*,#{type}*,#{type}*);\n"
	$decls << "void #{s}(int,#{type}*,#{type}*);\n"
	$install << "FIX2PTR(Numop,rb_hash_aref(op_dict,SYM(#{sym})))"
	$install << "->on_#{type}.op_zip = #{s};\n"
	$count += 1
end

# Generate the map and the zip function of every operator in $opcodes,
# for each enabled element type (:int32 is currently left out).
$opcodes.each_key do |op|
	[:uint8, :int16].each do |type|
		make_fun_map(op,type)
		make_fun_zip(op,type)
	end
end

# Finish the loader source: the declarations of all generated functions,
# the end of the extern "C" block, and the definition of startup_mmx(),
# whose body is the accumulated installation statements. The generated
# startup_mmx() returns early when the NO_MMX environment variable is set.
$loader.puts $decls
$loader.puts %`
}; /* extern */
#include <stdlib.h>
void startup_mmx_loader () {/*bogus*/}
void startup_mmx () {
	if (getenv("NO_MMX")) return;
	if (EVAL(\"GridFlow.bridge_name\")!=Qnil) gfpost(\"startup_cpu: using MMX optimisations\");
	#{$install}
}`
# Nothing more is written to the loader file: close it explicitly so that
# its contents are flushed here rather than whenever the interpreter exits.
$loader.close

STDERR.puts "automatically generated #{$count} MMX asm functions"

=begin notes:
CPUID has a bit for detecting MMX
PACKSSDW PACKSSWB PACKUSWB = saturation-casting
PCMPxx: Compare Packed Integers
PMULHW, PMULLW: Multiply Packed _unsigned_ 16-bit Integers, and Store
PUNPCKxxx: Unpack and Interleave Data
=end