OPTIMIZATIONS

   1 #1
   2 (defun mysl (s)
   3     (declare (simple-string s))
   4     (declare (optimize (speed 3) (safety 0) (debug 0)))
   5     (let ((c 0))
   6       (declare (fixnum c))
   7       (dotimes (i (length s))
   8         (when (eql (aref s i) #\1)
   9           (incf c)))
  10       c))
  11
  12 * On X86 I is represented as a tagged integer.
  13
  14 * Unnecessary move:
  15   3: SLOT S!11[EDX] {SB-C::VECTOR-LENGTH 1 7} => t23[EAX]
  16   4: MOVE t23[EAX] => t24[EBX]
  17
  18 --------------------------------------------------------------------------------
  19 #2
  20 (defun quux (v)
  21   (declare (optimize (speed 3) (safety 0) (space 2) (debug 0)))
  22   (declare (type (simple-array double-float 1) v))
  23   (let ((s 0d0))
  24     (declare (type double-float s))
  25     (dotimes (i (length v))
  26       (setq s (+ s (aref v i))))
  27     s))
  28
  29 * Python does not combine + with AREF, so generates extra move and
  30   allocates a register.
  31
  32 * On X86 Python thinks that all FP registers are directly accessible
  33   and emits costy MOVE ... => FR1.
  34
  35 --------------------------------------------------------------------------------
  36 #3
  37 (defun bar (n)
  38   (declare (optimize (speed 3) (safety 0) (space 2))
  39            (type fixnum n))
  40   (let ((v (make-list n)))
  41     (setq v (make-array n))
  42     (length v)))
  43
  44 * IR1 does not optimize away (MAKE-LIST N).
  45 --------------------------------------------------------------------------------
  46 #4
  47 (defun bar (v1 v2)
  48   (declare (optimize (speed 3) (safety 0) (space 2))
  49            (type (simple-array base-char 1) v1 v2))
  50   (dotimes (i (length v1))
  51     (setf (aref v2 i) (aref v1 i))))
  52
  53 VOP DATA-VECTOR-SET/SIMPLE-STRING V2!14[EDI] t32[EAX] t30[S2]>t33[CL]
  54                                   => t34[S2]<t35[AL]
  55         MOV     #<TN t33[CL]>, #<TN t30[S2]>
  56         MOV     BYTE PTR [EDI+EAX+1], #<TN t33[CL]>
  57         MOV     #<TN t35[AL]>, #<TN t33[CL]>
  58         MOV     #<TN t34[S2]>, #<TN t35[AL]>
  59
  60 * The value of DATA-VECTOR-SET is not used, so there is no need in the
  61   last two moves.
  62
  63 * And why two moves?
  64 --------------------------------------------------------------------------------
  65 #8
  66 (defun foo (d)
  67   (declare (optimize (speed 3) (safety 0) (debug 0)))
  68   (declare (type (double-float 0d0 1d0) d))
  69   (loop for i fixnum from 1 to 5
  70         for x1 double-float = (sin d) ;;; !!!
  71         do (loop for j fixnum from 1 to 4
  72                  sum x1 double-float)))
  73
  74 Without the marked declaration Python will use boxed representation for X1.
  75
  76 This is equivalent to
  77
  78 (let ((x nil))
  79   (setq x 0d0)
  80   ;; use of X as DOUBLE-FLOAT
  81 )
  82
  83 The initial binding is effectless, and without it X is of type
  84 DOUBLE-FLOAT. Unhopefully, IR1 does not optimize away effectless
  85 SETs/bindings, and IR2 does not perform type inference.
  86 --------------------------------------------------------------------------------
  87 #9 "Multi-path constant folding"
  88 (defun foo (x)
  89   (if (= (cond ((irgh x) 0)
  90                ((buh x) 1)
  91                (t 2))
  92          0)
  93       :yes
  94       :no))
  95
  96 This code could be optimized to
  97
  98 (defun foo (x)
  99   (cond ((irgh x) :yes)
 100         ((buh x) :no)
 101         (t :no)))
 102 --------------------------------------------------------------------------------
 103 #11
 104 (inverted variant of #9)
 105
 106 (lambda (x)
 107   (let ((y (sap-alien x c-string)))
 108     (list (alien-sap y)
 109           (alien-sap y))))
 110
 111 It could be optimized to
 112
 113 (lambda (x) (list x x))
 114
 115 (if Y were used only once, the current compiler would optimize it)
 116 --------------------------------------------------------------------------------
 117 #12
 118 (typep (truly-the (simple-array * (*)) x) 'simple-vector)
 119
 120 tests lowtag.
 121 --------------------------------------------------------------------------------
 122 #13
 123 FAST-+/FIXNUM and similar should accept unboxed arguments in interests
 124 of representation selection. Problem: inter-TN dependencies.
 125 --------------------------------------------------------------------------------
 126 #14
 127 The derived type of (/ (THE (DOUBLE-FLOAT (0D0)) X) (THE (DOUBLE-FLOAT
 128 1D0) Y)) is (DOUBLE-FLOAT 0.0d0). While it might be reasonable, it is
 129 better to derive (OR (MEMBER 0.0d0) (DOUBLE-FLOAT (0.0d0))).
 130 --------------------------------------------------------------------------------
 131 #15
 132 On the alpha, the system is reluctant to refer directly to a constant bignum,
 133 preferring to load a large constant through a slow sequence of instructions,
 134 then cons up a bignum for it:
 135
 136 (LAMBDA (A)
 137   (DECLARE (OPTIMIZE (SAFETY 1) (SPEED 3) (DEBUG 1))
 138            (TYPE (INTEGER -10000 10000) A)
 139            (IGNORABLE A))
 140   (CASE A
 141     ((89 125 16) (ASH A (MIN 18 -706)))
 142     (T (DPB -3 (BYTE 30 30) -1))))
 143 --------------------------------------------------------------------------------
 144 #16
 145 (do ((i 0 (1+ i)))
 146     ((= i (the (integer 0 100) n)))
 147   ...)
 148
 149 It is commonly expected for Python to derive (FIXNUMP I). (If ``='' is
 150 replaced with ``>='', Python will do.)
 151 --------------------------------------------------------------------------------
 152 #17
 153 Type tests for (ARRAY BIT), (ARRAY T) and similar go through full
 154 %TYPEP, even though it is relatively simple to establish the arrayness
 155 of an object and also to obtain the element type of an array.  As of
 156 sbcl-0.8.12.30, this affects at least DUMP-OBJECT through
 157 COMPOUND-OBJECT-P, and (LABELS MAYBE-EMIT-MAKE-LOAD-FORMS GROVEL)
 158 through TYPEP UNBOXED-ARRAY, within the compiler itself.
 159 --------------------------------------------------------------------------------
 160 #18
 161 (lambda (x) (declare (null x)) (sxhash x)) goes through SYMBOL-HASH
 162 rather than either constant-folding or manipulating NIL-VALUE or
 163 NULL-TN directly.
 164 --------------------------------------------------------------------------------
 165 #19
 166   (let ((dx (if (foo)
 167                 (list x)
 168                 (list y z))))
 169     (declare (dynamic-extent dx))
 170     ...)
 171
 172 DX is not allocated on stack.
 173 --------------------------------------------------------------------------------
 174 #20
 175 (defun-with-dx foo (x)
 176   (flet ((make (x)
 177            (let ((l (list nil nil)))
 178              (setf (first l) x)
 179              (setf (second l) (1- x))
 180              l)))
 181     (let ((l (make x)))
 182       (declare (dynamic-extent l))
 183       (mapc #'print l))))
 184
 185 Result of MAKE is not stack allocated, which means that
 186 stack-allocation of structures is impossible.
 187 --------------------------------------------------------------------------------
 188 #21
 189 (defun-with-dx foo ()
 190   (let ((dx (list (list 1 2) (list 3 4))))
 191     (declare (dynamic-extent dx))
 192     ...))
 193
 194 External list in DX is allocated on stack, but internal are not.
 195 --------------------------------------------------------------------------------
 196 #22
 197 IR2 does not perform unused code flushing.
 198 --------------------------------------------------------------------------------
 199 #23
 200 Python does not know that &REST lists are LISTs (and cannot derive it).
 201 --------------------------------------------------------------------------------
 202 #24
 203 a. Iterations on &REST lists, returning them as VALUES could be
 204    rewritten with &MORE vectors.
 205 b. Implement local unknown-values mv-call (useful for fast type checking).
 206 --------------------------------------------------------------------------------
 207 #26
 208 SBCL cannot derive upper bound for I and uses generic arithmetic here:
 209
 210 (defun foo (l)
 211   (declare (vector l))
 212   (dotimes (i (length l))
 213     (if (block nil
 214           (map-foo (lambda (x) (if x (return t)))
 215                    l))
 216         t
 217         nil)))
 218
 219 (So the constraint propagator or a possible future SSA-convertor
 220 should know the connection between an NLE and its CLEANUP.)
 221 --------------------------------------------------------------------------------
 222 #27
 223 Initialization of stack-allocated arrays is inefficient: we always
 224 fill the vector with zeroes, even when it is not needed (as for
 225 platforms with conservative GC or for arrays of unboxed objectes) and
 226 is performed later explicitely.
 227 --------------------------------------------------------------------------------
 228 #28
 229 a. Accessing raw slots in structure instances is more inefficient than
 230 it could be; if we placed raw slots before the header word, we would
 231 not need to do arithmetic at runtime to access them.  (But beware:
 232 this would complicate handling of the interior pointer).
 233
 234 b. (Also note that raw slots are currently disabled on HPPA)
 235 --------------------------------------------------------------------------------
 236 #29
 237 Python is overly zealous when converting high-level CL functions, such
 238 as MIN/MAX, LOGBITP, and LOGTEST, to low-level CL functions.  Reducing
 239 Python's aggressiveness would make it easier to effect changes such as
 240
 241 x86-64:
 242 * direct MIN/MAX on {SINGLE,DOUBLE}-FLOATs ({MIN,MAX}S{S,D})
 243
 244 x86{,-64}:
 245 * direct LOGBITP on word-sized integers and fixnums (BT + JC)
 246
 247 x86{,-64}/PPC:
 248 * branch-free MIN/MAX on word-sized integers and fixnums
 249 * efficient LOGTESTs on word-sized integers and fixnums (TEST / AND.)
 250
 251 PPC:
 252 * efficient LDB on word-sized integers and fixnums (RLWINM)
 253
 254 etc., etc.
 255
 256 The "easier" part claimed above would come about because the functions
 257 would be available for :TRANSLATE through a VOP or similar, whereas with
 258 the current architecture, one would have to pattern-match IR1.  While
 259 IR1 pattern-matching would be useful in other contexts, it seems better
 260 here to attempt the direct :TRANSLATE route.
 261
 262 I (NJF) don't know how to implement such architecture-specific
 263 optimizations whilst keeping the high->low transformations for other
 264 architectures.  Certainly adding #!+/- magic in compiler/*.lisp could be
 265 done, but such a solution is somewhat inelegant.  Moving the relevant
 266 DEFTRANSFORMs to the architecture-specific compiler/* areas is also
 267 possible, but that would duplicate quite a bit of code.
 268 --------------------------------------------------------------------------------
 269 #30
 270 (defun foo (x y)
 271   (< x y))
 272
 273 FOO's IR1 representation is roughly:
 274
 275 (defun foo (x y)
 276   (if (< x y)
 277       T
 278       NIL))
 279
 280 However, if a full call is generated for < (and similarly for other
 281 predicate functions), then the IF is unnecessary, since the return value
 282 of (< x y) is already T or NIL.