=begin
  $Id: mmx.rb,v 1.1 2005-10-04 02:02:14 matju Exp $

  GridFlow
  Copyright (c) 2001,2002,2003,2004 by Mathieu Bouchard

  This program is free software; you can redistribute it and/or
  modify it under the terms of the GNU General Public License
  as published by the Free Software Foundation; either version 2
  of the License, or (at your option) any later version.

  See file ../COPYING for further informations on licensing terms.

  This program is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.

  You should have received a copy of the GNU General Public License
  along with this program; if not, write to the Free Software
  Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
=end

# Command line: ARGV[0] = assembly output file, ARGV[1] = C loader output file.
# Fail with a readable message instead of a TypeError from STDOUT.reopen(nil).
abort "usage: #{$0} <asm-output-file> <c-loader-output-file>" if ARGV.length < 2

# From here on, every bare `puts` writes into the assembly file (ARGV[0]);
# the C loader source is written through $loader (ARGV[1]).
STDOUT.reopen ARGV[0], "w"
$loader = File.open ARGV[1], "w"
$count = 0 # number of asm functions generated so far
$lines = 0
puts "; generated by/for GridFlow 0.8.0"
$loader.puts "#include \"../base/grid.h.fcs\"\nextern \"C\" {"

# Emits the text of one assembly function on STDOUT and hands out unique
# local labels of the form "<name>_<n>" for the loops inside it.
# (original note: this class is not really used yet (only self.make))
class AsmFunction
  # name:: global symbol of the function; also the prefix of its labels.
  def initialize(name)
    @name = name
    @label_count = 1
  end

  # Prints the GLOBAL directive, the entry label and a prologue that saves
  # ebp, esi and edi; yields a new AsmFunction for +name+ so the block can
  # print the body; then prints the matching epilogue and "ret".
  def self.make(name)
    puts "", "GLOBAL #{name}", "#{name}:"
    puts "push ebp", "mov ebp,esp", "push esi", "push edi"
    yield AsmFunction.new(name)
    puts "pop edi", "pop esi", "leave", "ret", ""
  end

  # Prints a loop: a start label, the instructions in +ops+, the block's
  # output, a jump back to the start label, and finally an exit label.
  #
  # ops:: instruction strings; the LAST one must be a conditional jump
  #       mnemonic without its target (e.g. "jz"), because the exit label
  #       is appended to it.
  #
  # Raises ArgumentError when no instruction is given.
  def make_until(*ops)
    raise ArgumentError, "make_until needs at least one instruction" if ops.empty?
    top = "#{@name}_#{@label_count}"
    done = "#{@name}_#{@label_count + 1}"
    @label_count += 2
    # Build a new string for the jump line: appending in place would modify
    # the caller's argument (and fails outright on a frozen string).
    exit_jump = "#{ops[-1]} #{done}"
    puts "#{top}: ", *(ops[0...-1] + [exit_jump])
    yield
    puts "jmp #{top}"
    puts "#{done}:"
  end
end

# Size in bytes of one element of each grid number type.
$sizeof = {
  :uint8 => 1,
  :int16 => 2,
  :int32 => 4,
  :int64 => 8,
  :float32 => 4,
  :float64 => 8,
}

# x86 accumulator register of the matching width, for the types that have one.
$accum = {
  :uint8 => "al",
  :int16 => "ax",
  :int32 => "eax",
}

# NASM operand-size keyword for each integer type.
$asm_type = {
  :uint8 => "byte",
  :int16 => "word",
  :int32 => "dword",
  :int64 => "qword",
}

# in the following, the opcode "_" means no such thing seems available.
# also >> for x86 ought to be shr in the uint8 case.
# btw, i got all of the MMX information from the NASM manual, Appendix B.
# Each row is: GridFlow operator symbol, scalar x86 opcode, then one MMX
# opcode per element type (see the column header below).
$opcodes = {
# [--GF--|--x86--|--mmx-et-al----------------------------------------]
# [      |       |-uint8-|-int16-|-int32-|-int64-|-float32-|-float64-]
  :add => %w[ + add paddb paddw paddd paddq ],
  :sub => %w[ - sub psubb psubw psubd psubq ],
  :and => %w[ & and pand pand pand pand ],
  :xor => %w[ ^ xor pxor pxor pxor pxor ],
  :or  => %w[ | or por por por por ],
# :max => %w[ max _ pmaxub pmaxsw _ _ ], # not plain MMX !!! (req.Katmai)
# :min => %w[ min _ pminub pminsw _ _ ], # not plain MMX !!! (req.Katmai)
# :eq => %w[ == _ pcmpeqb pcmpeqw pcmpeqd _ ],
# :gt => %w[ > _ pcmpgtb pcmpgtw pcmpgtd _ ],
# :shl => %w[ << shl _ psllw pslld psllq ], # noncommutative
# :shr => %w[ >> sar _ psraw psrad _ ], # noncommutative
# :clipadd => %w[ clip+ _ paddusb paddsw _ _ ], # future use
# :clipsub => %w[ clip- _ psubusb psubsw _ _ ], # future use
# :andnot => %w[ ¬ _ pandn pandn pandn pandn ], # not planned
}

# Turn the "_" placeholders into nil and log every row on STDERR.
$opcodes.each_value {|row|
  row.map! {|x| x == "_" ? nil : x }
  STDERR.puts row.inspect
}

# Column of an $opcodes row that holds the MMX opcode for each element type.
# (The previous size-based arithmetic picked the int32 column for float32
# and ran past the table for 8-byte types.)
$mmx_column = { :uint8 => 2, :int16 => 3, :int32 => 4, :int64 => 5 }

$decls = ""   # C prototypes of the generated functions
$install = "" # C statements that register the generated functions

# Returns the MMX opcode for +op+ on +type+, or nil when a function cannot
# be generated for that pair: no MMX opcode, no scalar x86 opcode for the
# leftover loop, or no accumulator register / operand-size keyword.
def mmx_opcode_for(op,type)
  row = $opcodes[op]
  column = $mmx_column[type]
  return nil unless column && row[1] && $accum[type] && $asm_type[type]
  row[column]
end

# Generates "mmx_<type>_map_<op>", declared to C as
#   void f(int n, <type> *data, <type> scalar)
# The asm reads n into ecx, data into esi and scalar into eax, applies
# "<op> scalar" to the buffer in place, and records the prototype in $decls
# and the registration statement in $install. Does nothing when
# mmx_opcode_for has no opcode for the pair.
def make_fun_map(op,type)
  fun_name = "mmx_#{type}_map_#{op}"
  size = $sizeof[type]
  accum = $accum[type]
  sym = $opcodes[op][0]
  opcode = $opcodes[op][1]
  mopcode = mmx_opcode_for(op,type)
  return unless mopcode
  per_quad = 8/size # elements in one 8-byte MMX register
  AsmFunction.make(fun_name) {|a|
    puts "mov ecx,[ebp+8]", "mov esi,[ebp+12]", "mov eax,[ebp+16]"
    # replicate the scalar across all of eax, then across mm7
    puts "mov dx,ax", "shl eax,8", "mov al,dl" if size==1
    puts "mov edx,eax", "shl eax,16", "mov ax,dx" if size<=2
    puts "push eax", "push eax", "movq mm7,[esp]", "add esp,8"
    # loop handling n quadwords per iteration while enough elements remain
    unrolled = proc {|n|
      a.make_until("cmp ecx,#{per_quad*n}","jb near") {
        0.step(n-1,4) {|k|
          regs = 0...[n-k,4].min
          regs.each {|i| puts "movq mm#{i},[esi+#{8*(k+i)}]" }
          regs.each {|i| puts "#{mopcode} mm#{i},mm7" }
          regs.each {|i| puts "movq [esi+#{8*(k+i)}],mm#{i}" }
        }
        puts "lea esi,[esi+#{8*n}]", "lea ecx,[ecx-#{per_quad*n}]"
      }
    }
    unrolled.call 4
    unrolled.call 1
    # leftover elements, one at a time with the scalar opcode
    a.make_until("test ecx,ecx", "jz") {
      puts "#{opcode} #{$asm_type[type]} [esi],#{accum}", "lea esi,[esi+#{size}]"
      puts "dec ecx"
    }
    puts "emms"
  }
  $decls << "void #{fun_name}(int,#{type}*,#{type});\n"
  $install << "FIX2PTR(Numop,rb_hash_aref(op_dict,SYM(#{sym})))"
  $install << "->on_#{type}.op_map = #{fun_name};\n"
  $count += 1
end

# Generates "mmx_<type>_zip_<op>", declared to C as
#   void f(int n, <type> *a, <type> *b)
# The asm reads n into ecx, a into edi and b into esi, and stores
# "a[i] <op> b[i]" back into a. Records the prototype in $decls and the
# registration statement in $install. Does nothing when mmx_opcode_for has
# no opcode for the pair.
def make_fun_zip(op,type)
  fun_name = "mmx_#{type}_zip_#{op}"
  size = $sizeof[type]
  accum = $accum[type]
  sym = $opcodes[op][0]
  opcode = $opcodes[op][1]
  mopcode = mmx_opcode_for(op,type)
  return unless mopcode
  per_quad = 8/size # elements in one 8-byte MMX register
  AsmFunction.make(fun_name) {|a|
    puts "mov ecx,[ebp+8]", "mov edi,[ebp+12]", "mov esi,[ebp+16]"#, "mov ebx,[ebp+20]"
    # loop handling n quadwords per iteration while enough elements remain
    unrolled = proc {|n|
      a.make_until("cmp ecx,#{per_quad*n}","jb near") {
        0.step(n-1,4) {|k|
          regs = 0...[n-k,4].min
          regs.each {|i| puts "movq mm#{i},[edi+#{8*(k+i)}]" }
          regs.each {|i| puts "movq mm#{i+4},[esi+#{8*(k+i)}]" }
          regs.each {|i| puts "#{mopcode} mm#{i},mm#{i+4}" }
          regs.each {|i| puts "movq [edi+#{8*(k+i)}],mm#{i}" }
        }
        #for i in 0...n do puts "movq [ebx+#{8*i}],mm#{i}" end
        puts "lea edi,[edi+#{8*n}]"
        puts "lea esi,[esi+#{8*n}]"
        #puts "lea ebx,[ebx+#{8*n}]"
        puts "lea ecx,[ecx-#{per_quad*n}]"
      }
    }
    unrolled.call 4
    unrolled.call 1
    # leftover elements, one at a time with the scalar opcode
    a.make_until("test ecx,ecx", "jz") {
      # requires commutativity ??? fails with shl, shr
      puts "mov #{accum},[esi]"
      puts "#{opcode} #{$asm_type[type]} [edi],#{accum}"
      #puts "mov #{accum},[edi]"
      #puts "#{opcode} #{accum},[esi]"
      #puts "mov [ebx],#{accum}"
      puts "lea edi,[edi+#{size}]"
      puts "lea esi,[esi+#{size}]"
      #puts "lea ebx,[ebx+#{size}]"
      puts "dec ecx"
    }
    puts "emms"
  }
  #$decls << "void #{fun_name}(int,#{type}*,#{type}*,#{type}*);\n"
  $decls << "void #{fun_name}(int,#{type}*,#{type}*);\n"
  $install << "FIX2PTR(Numop,rb_hash_aref(op_dict,SYM(#{sym})))"
  $install << "->on_#{type}.op_zip = #{fun_name};\n"
  $count += 1
end

# Generate a map and a zip function for every enabled operator and type.
$opcodes.each_key {|op|
  [:uint8, :int16].each {|type| # :int32 is disabled
    make_fun_map(op,type)
    make_fun_zip(op,type)
  }
}

# Finish the C loader: prototypes, then the startup function that installs
# the generated functions. getenv() is declared by <stdlib.h>; the include
# line must name it, a bare "#include" does not preprocess.
$loader.puts $decls
$loader.puts %`
}; /* extern */
#include <stdlib.h>
void startup_mmx_loader () {/*bogus*/}
void startup_mmx () {
  if (getenv("NO_MMX")) return;
  if (EVAL("GridFlow.bridge_name")!=Qnil) gfpost("startup_cpu: using MMX optimisations");
  #{$install}
}`
# Nothing writes to the loader after this point; close it explicitly so
# the file is flushed and any write error surfaces here.
$loader.close

STDERR.puts "automatically generated #{$count} MMX asm functions"

=begin
notes:
  CPUID has a bit for detecting MMX
  PACKSSDW PACKSSWB PACKUSWB = saturation-casting
  PCMPxx: Compare Packed Integers
  PMULHW, PMULLW: Multiply Packed _unsigned_ 16-bit Integers, and Store
  PUNPCKxxx: Unpack and Interleave Data
=end