1.0.3.23: fix sb-posix timeval struct

[sbcl.git] / OPTIMIZATIONS
diff --git a/OPTIMIZATIONS b/OPTIMIZATIONS

index eb2931f..d234ce3 100644 (file)
--- a/OPTIMIZATIONS
+++ b/OPTIMIZATIONS
@@ -224,3 +224,189 @@ Initialization of stack-allocated arrays is inefficient: we always
  fill the vector with zeroes, even when it is not needed (as for
  platforms with conservative GC or for arrays of unboxed objectes) and
  is performed later explicitely.
  fill the vector with zeroes, even when it is not needed (as for
  platforms with conservative GC or for arrays of unboxed objectes) and
  is performed later explicitely.
+--------------------------------------------------------------------------------
+#28
+a. Accessing raw slots in structure instances is more inefficient than
+it could be; if we placed raw slots before the header word, we would
+not need to do arithmetic at runtime to access them.  (But beware:
+this would complicate handling of the interior pointer).
+
+b. (Also note that raw slots are currently disabled on HPPA)
+--------------------------------------------------------------------------------
+#29
+Python is overly zealous when converting high-level CL functions, such
+as MIN/MAX, LOGBITP, and LOGTEST, to low-level CL functions.  Reducing
+Python's aggressiveness would make it easier to effect changes such as
+
+x86-64:
+* direct MIN/MAX on {SINGLE,DOUBLE}-FLOATs ({MIN,MAX}S{S,D})
+
+x86-64:
+* direct LOGBITP on word-sized integers and fixnums (BT + JC)
+
+x86{,-64}/PPC:
+* branch-free MIN/MAX on word-sized integers and fixnums (floats could
+  be handled too, modulo safety considerations on the PPC)
+
+x86-64:
+* efficient LOGTESTs on word-sized integers and fixnums (TEST)
+
+etc., etc.
+
+(The framework for this has been implemented as of 0.9.9.18; see the
+vm-support-routine COMBINATION-IMPLEMENTATION-STYLE and its use in
+src/compiler/ir1opt.lisp, IR1-OPTIMIZE-COMBINATION.  The above
+optimizations are left as an exercise for the reader.)
+--------------------------------------------------------------------------------
+#30
+(defun foo (x y)
+  (< x y))
+
+FOO's IR1 representation is roughly:
+
+(defun foo (x y)
+  (if (< x y)
+      T
+      NIL))
+
+However, if a full call is generated for < (and similarly for other
+predicate functions), then the IF is unnecessary, since the return value
+of (< x y) is already T or NIL.
+--------------------------------------------------------------------------------
+#31
+The typecheck generated for a declaration like (integer 0 45) on x86 looks
+like:
+
+;      12B:       F6C203           TEST DL, 3
+;      12E:       753B             JNE L1
+;      130:       8BC2             MOV EAX, EDX
+;      132:       83F800           CMP EAX, 0
+;      135:       7C34             JL L1
+;      137:       8BC2             MOV EAX, EDX
+;      139:       3DB4000000       CMP EAX, 180
+;      13E:       7F2B             JNLE L1
+
+A better code sequence for this would be:
+
+  TEST DL, 3
+  JNE L1
+  MOV EAX, EDX
+  CMP EAX, 180
+  JBE L1
+
+Doing an unsigned comparison means that, similarly to %CHECK-BOUND, we can
+combine the <0 and >=bound tests.  This sort of test is generated often
+in SBCL and any array-based code that's serious about type-checking its
+indices.
+--------------------------------------------------------------------------------
+#32
+The code for a vector bounds check on x86 (similarly on x86-64) where
+the vector is in EDX and the index in EAX looks like:
+
+;       49: L0:   8B5AFD           MOV EBX, [EDX-3]
+;       4C:       39C3             CMP EBX, EAX
+;       4E:       7632             JBE L2
+
+because %CHECK-BOUND is used for bounds-checking any array dimension.
+A more efficient specialization (%CHECK-BOUND/VECTOR) would produce:
+
+  CMP [EDX-3], EAX
+  JBE L2
+
+Which is slightly shorter and avoids using a register.
+--------------------------------------------------------------------------------
+#33
+Reports from the Java camp indicate that using an SSE2-based
+floating-point backend on x86 when possible is highly preferable to
+using the x86 FP stack.  It would be nice if SBCL included an SSE2-based
+floating point backend with a compile-time option to switch between the
+two.
+--------------------------------------------------------------------------------
+#34
+Compiling
+
+(defun foo (x y)
+  (declare (type (integer 0 45) x y))
+  (+ x y))
+
+results in the following error trapping code for type-checking the
+arguments:
+
+;      424: L0:   8B058CE31812     MOV EAX, [#x1218E38C]      ; '(MOD 46)
+;      42A:       0F0B0A           BREAK 10                   ; error trap
+;      42D:       05               BYTE #X05
+;      42E:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
+;      42F:       FECE01           BYTE #XFE, #XCE, #X01      ; EDI
+;      432:       0E               BYTE #X0E                  ; EAX
+;      433: L1:   8B0590E31812     MOV EAX, [#x1218E390]      ; '(MOD 46)
+;      439:       0F0B0A           BREAK 10                   ; error trap
+;      43C:       03               BYTE #X03
+;      43D:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
+;      43E:       8E               BYTE #X8E                  ; EDX
+;      43F:       0E               BYTE #X0E                  ; EAX
+
+Notice that '(MOD 46) has two entries in the constant vector.  Having
+one would be preferable.
+--------------------------------------------------------------------------------
+#35
+Compiling
+
+(defun foo (a i)
+  (declare (type simple-vector a))
+  (aref a i))
+
+results in the following x86 code:
+
+; 115886E9:       F7C703000000     TEST EDI, 3                ; no-arg-parsing entry point
+;      6EF:       7510             JNE L0
+;      6F1:       8BC7             MOV EAX, EDI
+;      6F3:       83F800           CMP EAX, 0
+;      6F6:       7C09             JL L0
+;      6F8:       8BC7             MOV EAX, EDI
+;      6FA:       3DF8FFFF7F       CMP EAX, 2147483640
+;      6FF:       7E0F             JLE L1
+;      701: L0:   8B057C865811     MOV EAX, [#x1158867C]      ; '(MOD
+                                                              ;   536870911)
+;      707:       0F0B0A           BREAK 10                   ; error trap
+;      70A:       05               BYTE #X05
+;      70B:       1F               BYTE #X1F                  ; OBJECT-NOT-TYPE-ERROR
+;      70C:       FECE01           BYTE #XFE, #XCE, #X01      ; EDI
+;      70F:       0E               BYTE #X0E                  ; EAX
+;      710: L1:   8B42FD           MOV EAX, [EDX-3]
+;      713:       8BCF             MOV ECX, EDI
+;      715:       39C8             CMP EAX, ECX
+;      717:       7620             JBE L2
+;      719:       8B540A01         MOV EDX, [EDX+ECX+1]
+
+... plus the standard return sequence and some error blocks.  The
+`TEST EDI, 3' and associated comparisons are to ensure that `I' is a
+positive fixnum.  The associated comparisons are unnecessary, as the
+%CHECK-BOUND VOP only requires its tested index to be a fixnum and takes
+care of the negative fixnum case itself.
+
+{HAIRY-,}DATA-VECTOR-REF are DEFKNOWN'd with EXPLICIT-CHECK, which would
+seem to take care of this, but EXPLICIT-CHECK only seems to be used when
+compiling calls to unknown functions or similar.  Furthermore,
+EXPLICIT-CHECK, as NJF understands it, doesn't have the right
+semantics--it suppresses all type checking of arguments, whereas what we
+really want is to ensure that the argument is a fixnum, but not check
+its positiveness.
+--------------------------------------------------------------------------------
+#36
+
+In #35, the CMP EAX, $foo instructions are all preceded by a MOV.  They
+appear to be unnecessary, but are necessary because in IR2, EDI is a
+DESCRIPTOR-REG, whereas EAX is an ANY-REG--and the comparison VOPs only
+accept ANY-REGs.  Therefore, the MOVs are "necessary" to ensure that the
+comparison VOP receives an TN of the appropriate storage class.
+
+Obviously, it would be better if a) we only performed one MOV prior to
+all three comparisons or b) eliminated the necessity of the MOV(s)
+altogether.  The former option is probably easier than the latter.
+
+--------------------------------------------------------------------------------
+#37
+
+Dynamic extent allocation doesn't currently work for one-element lists,
+since there's a source transform from (LIST X) to (CONS X NIL).
+