/*
 * M7memcpy: Optimized SPARC M7 memcpy
 *
 * Copyright (c) 2016, Oracle and/or its affiliates. All rights reserved.
 */

        .file   "M7memcpy.S"

/*
 * memcpy(s1, s2, len)
 *
 * Copy s2 to s1, always copy n bytes.
 * Note: this C code does not work for overlapped copies.
 *
 * Fast assembler language version of the following C-program for memcpy
 * which represents the `standard' for the C-library.
 *
 *      void *
 *      memcpy(void *s, const void *s0, size_t n)
 *      {
 *              if (n != 0) {
 *                  char *s1 = s;
 *                  const char *s2 = s0;
 *                  do {
 *                      *s1++ = *s2++;
 *                  } while (--n != 0);
 *              }
 *              return (s);
 *      }
 *
 *
 * SPARC T7/M7 Flow:
 *
 * if (count < SMALL_MAX) {
 *   if count < SHORTCOPY              (SHORTCOPY=3)
 *      copy bytes; exit with dst addr
 *   if src & dst aligned on word boundary but not long word boundary,
 *     copy with ldw/stw; branch to finish_up
 *   if src & dst aligned on long word boundary
 *     copy with ldx/stx; branch to finish_up
 *   if src & dst not aligned and length <= SHORTCHECK   (SHORTCHECK=14)
 *     copy bytes; exit with dst addr
 *   move enough bytes to get src to word boundary
 *   if dst now on word boundary
 * move_words:
 *     copy words; branch to finish_up
 *   if dst now on half word boundary
 *     load words, shift half words, store words; branch to finish_up
 *   if dst on byte 1
 *     load words, shift 3 bytes, store words; branch to finish_up
 *   if dst on byte 3
 *     load words, shift 1 byte, store words; branch to finish_up
 * finish_up:
 *     copy bytes; exit with dst addr
 * } else {                                         More than SMALL_MAX bytes
 *   move bytes until dst is on long word boundary
 *   if( src is on long word boundary ) {
 *     if (count < MED_MAX) {
 * finish_long:                                    src/dst aligned on 8 bytes
 *       copy with ldx/stx in 8-way unrolled loop;
 *       copy final 0-63 bytes; exit with dst addr
 *     } else {                              src/dst aligned; count > MED_MAX
 *       align dst on 64 byte boundary; for main data movement:
 *       prefetch src data to L2 cache; let HW prefetch move data to L1 cache
 *       Use BIS (block initializing store) to avoid copying store cache
 *       lines from memory. But pre-store first element of each cache line
 *       ST_CHUNK lines in advance of the rest of that cache line. That
 *       gives time for replacement cache lines to be written back without
 *       excess STQ and Miss Buffer filling. Repeat until near the end,
 *       then finish up storing before going to finish_long.
 *     }
 *   } else {                                   src/dst not aligned on 8 bytes
 *     if src is word aligned and count < MED_WMAX
 *       move words in 8-way unrolled loop
 *       move final 0-31 bytes; exit with dst addr
 *     if count < MED_UMAX
 *       use alignaddr/faligndata combined with ldd/std in 8-way
 *       unrolled loop to move data.
 *       go to unalign_done
 *     else
 *       setup alignaddr for faligndata instructions
 *       align dst on 64 byte boundary; prefetch src data to L1 cache
 *       loadx8, falign, block-store, prefetch loop
 *       (only use block-init-store when src/dst on 8 byte boundaries.)
 * unalign_done:
 *       move remaining bytes for unaligned cases. exit with dst addr.
 * }
 *
 */

#include <asm/visasm.h>
#include <asm/asi.h>

#if !defined(EX_LD) && !defined(EX_ST)
#define NON_USER_COPY
#endif

#ifndef EX_LD
#define EX_LD(x,y)      x
#endif
#ifndef EX_LD_FP
#define EX_LD_FP(x,y)   x
#endif

#ifndef EX_ST
#define EX_ST(x,y)      x
#endif
#ifndef EX_ST_FP
#define EX_ST_FP(x,y)   x
#endif

#ifndef EX_RETVAL
#define EX_RETVAL(x)    x
#endif

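/*
 * When this file is built into the user-copy wrappers, EX_LD/EX_ST
 * attach an exception-table fixup to each access; the second argument
 * names the fixup run on a fault, which returns the number of bytes
 * not copied.  A rough C model of the naming convention used by the
 * memcpy_retl_* fixups referenced throughout this file (sketch only;
 * the real fixups are small assembly stubs defined elsewhere):
 *
 *      size_t memcpy_retl_o2_plus_8(size_t o2)
 *      {
 *              return o2 + 8;  // %o2 outstanding + 8 bytes in flight
 *      }
 *
 * When built as plain memcpy (NON_USER_COPY), the macros expand to the
 * bare instruction and the fixup argument is discarded.
 */
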
#ifndef LOAD
#define LOAD(type,addr,dest)    type [addr], dest
#endif

#ifndef STORE
#define STORE(type,src,addr)    type src, [addr]
#endif

/*
 * ASI_BLK_INIT_QUAD_LDD_P/ASI_BLK_INIT_QUAD_LDD_S marks the cache
 * line as "least recently used" which means if many threads are
 * active, it has a high probability of being pushed out of the cache
 * between the first initializing store and the final stores.
 * Thus, we use ASI_ST_BLKINIT_MRU_P/ASI_ST_BLKINIT_MRU_S which
 * marks the cache line as "most recently used" for all
 * but the last cache line.
 */
#ifndef STORE_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_ASI       ASI_BLK_INIT_QUAD_LDD_P
#else
#define STORE_ASI       0x80            /* ASI_P */
#endif
#endif

#ifndef STORE_MRU_ASI
#ifndef SIMULATE_NIAGARA_ON_NON_NIAGARA
#define STORE_MRU_ASI   ASI_ST_BLKINIT_MRU_P
#else
#define STORE_MRU_ASI   0x80            /* ASI_P */
#endif
#endif

#ifndef STORE_INIT
#define STORE_INIT(src,addr)    stxa src, [addr] STORE_ASI
#endif

#ifndef STORE_INIT_MRU
#define STORE_INIT_MRU(src,addr)        stxa src, [addr] STORE_MRU_ASI
#endif

#ifndef FUNC_NAME
#define FUNC_NAME       M7memcpy
#endif

#ifndef PREAMBLE
#define PREAMBLE
#endif

#define BLOCK_SIZE      64
#define SHORTCOPY       3
#define SHORTCHECK      14
#define SHORT_LONG      64      /* max copy for short longword-aligned case */
                                /* must be at least 64 */
#define SMALL_MAX       128
#define MED_UMAX        1024    /* max copy for medium un-aligned case */
#define MED_WMAX        1024    /* max copy for medium word-aligned case */
#define MED_MAX         1024    /* max copy for medium longword-aligned case */
#define ST_CHUNK        24      /* ST_CHUNK - block of values for BIS Store */
#define ALIGN_PRE       24      /* distance for aligned prefetch loop */

        .register       %g2,#scratch

        .section        ".text"
        .global         FUNC_NAME
        .type           FUNC_NAME, #function
        .align          16
FUNC_NAME:
        srlx            %o2, 31, %g2
        cmp             %g2, 0
        tne             %xcc, 5
        PREAMBLE
        mov             %o0, %g1        ! save %o0
        brz,pn          %o2, .Lsmallx
         cmp            %o2, 3
        ble,pn          %icc, .Ltiny_cp
         cmp            %o2, 19
        ble,pn          %icc, .Lsmall_cp
         or             %o0, %o1, %g2
        cmp             %o2, SMALL_MAX
        bl,pn           %icc, .Lmedium_cp
         nop

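/*
 * Entry dispatch in rough C (illustrative only; the srlx/tne pair
 * above traps if the length does not fit in 31 bits):
 *
 *      if (len == 0)           return dst;             // .Lsmallx
 *      if (len <= 3)           goto .Ltiny_cp;         // byte-at-a-time
 *      if (len <= 19)          goto .Lsmall_cp;
 *      if (len < SMALL_MAX)    goto .Lmedium_cp;
 *      // otherwise fall through to .Lmedium: the large-copy path,
 *      // which starts with a full src/dst alignment analysis
 */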
.Lmedium:
        neg     %o0, %o5
        andcc   %o5, 7, %o5             ! bytes till DST 8 byte aligned
        brz,pt  %o5, .Ldst_aligned_on_8

        ! %o5 has the bytes to be written in partial store.
         sub    %o2, %o5, %o2
        sub     %o1, %o0, %o1           ! %o1 gets the difference
7:                                      ! dst aligning loop
        add     %o1, %o0, %o4
        EX_LD(LOAD(ldub, %o4, %o4), memcpy_retl_o2_plus_o5)     ! load one byte
        subcc   %o5, 1, %o5
        EX_ST(STORE(stb, %o4, %o0), memcpy_retl_o2_plus_o5_plus_1)
        bgu,pt  %xcc, 7b
         add    %o0, 1, %o0             ! advance dst
        add     %o1, %o0, %o1           ! restore %o1
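
/*
 * The aligning loop above keeps %o1 as (src - dst) so that only %o0
 * needs to advance; the load address is rebuilt as %o1 + %o0 on each
 * pass.  Equivalent C (sketch; pad is the byte count in %o5):
 *
 *      ptrdiff_t diff = src - dst;
 *      while (pad--) {
 *              *dst = *(dst + diff);   // one byte toward 8B alignment
 *              dst++;
 *      }
 *      src = dst + diff;               // the final "add %o1, %o0, %o1"
 */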
.Ldst_aligned_on_8:
        andcc   %o1, 7, %o5
        brnz,pt %o5, .Lsrc_dst_unaligned_on_8
         nop

.Lsrc_dst_aligned_on_8:
        ! check if we are copying MED_MAX or more bytes
        set MED_MAX, %o3
        cmp %o2, %o3                    ! limit to store buffer size
        bgu,pn  %xcc, .Llarge_align8_copy
         nop

/*
 * Special case for handling when src and dest are both long word aligned
 * and total data to move is less than MED_MAX bytes
 */
.Lmedlong:
        subcc   %o2, 63, %o2            ! adjust length to allow cc test
        ble,pn  %xcc, .Lmedl63          ! skip big loop if less than 64 bytes
         nop
.Lmedl64:
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_63)      ! load
        subcc   %o2, 64, %o2            ! decrement length count
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_63_64)  ! and store
        EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_63_56) ! a block of 64
        EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_63_56)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_63_48)
        EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_63_48)
        EX_LD(LOAD(ldx, %o1+24, %o3), memcpy_retl_o2_plus_63_40)
        EX_ST(STORE(stx, %o3, %o0+24), memcpy_retl_o2_plus_63_40)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_63_32)! load and store
        EX_ST(STORE(stx, %o4, %o0+32), memcpy_retl_o2_plus_63_32)
        EX_LD(LOAD(ldx, %o1+40, %o3), memcpy_retl_o2_plus_63_24)! a block of 64
        add     %o1, 64, %o1            ! increase src ptr by 64
        EX_ST(STORE(stx, %o3, %o0+40), memcpy_retl_o2_plus_63_24)
        EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_63_16)
        add     %o0, 64, %o0            ! increase dst ptr by 64
        EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_63_16)
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_63_8)
        bgu,pt  %xcc, .Lmedl64          ! repeat if at least 64 bytes left
         EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_63_8)
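
/*
 * .Lmedl64 above moves 64 bytes per iteration as eight ldx/stx pairs,
 * with each load issued ahead of its store to hide latency.
 * Illustrative C:
 *
 *      while (n >= 64) {               // n is biased by -63 in the asm
 *              int i;
 *              for (i = 0; i < 8; i++) // fully unrolled above
 *                      ((u64 *)dst)[i] = ((const u64 *)src)[i];
 *              src += 64; dst += 64; n -= 64;
 *      }
 */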
.Lmedl63:
        addcc   %o2, 32, %o2            ! adjust remaining count
        ble,pt  %xcc, .Lmedl31          ! to skip if 31 or fewer bytes left
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_31)      ! load
        sub     %o2, 32, %o2            ! decrement length count
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_31_32)  ! and store
        EX_LD(LOAD(ldx, %o1+8, %o3), memcpy_retl_o2_plus_31_24) ! a block of 32
        add     %o1, 32, %o1            ! increase src ptr by 32
        EX_ST(STORE(stx, %o3, %o0+8), memcpy_retl_o2_plus_31_24)
        EX_LD(LOAD(ldx, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
        add     %o0, 32, %o0            ! increase dst ptr by 32
        EX_ST(STORE(stx, %o4, %o0-16), memcpy_retl_o2_plus_31_16)
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_31_8)
        EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_31_8)
.Lmedl31:
        addcc   %o2, 16, %o2            ! adjust remaining count
        ble,pt  %xcc, .Lmedl15          ! skip if 15 or fewer bytes left
         nop                            !
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_15)
        add     %o1, 16, %o1            ! increase src ptr by 16
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_15)
        sub     %o2, 16, %o2            ! decrease count by 16
        EX_LD(LOAD(ldx, %o1-8, %o3), memcpy_retl_o2_plus_15_8)
        add     %o0, 16, %o0            ! increase dst ptr by 16
        EX_ST(STORE(stx, %o3, %o0-8), memcpy_retl_o2_plus_15_8)
.Lmedl15:
        addcc   %o2, 15, %o2            ! restore count
        bz,pt   %xcc, .Lsmallx          ! exit if finished
         cmp    %o2, 8
        blt,pt  %xcc, .Lmedw7           ! skip if 7 or fewer bytes left
         tst    %o2
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)      ! load 8 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        add     %o0, 8, %o0             ! increase dst ptr by 8
        subcc   %o2, 8, %o2             ! decrease count by 8
        bnz,pn  %xcc, .Lmedw7
         EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)   ! and store 8
        retl
         mov    EX_RETVAL(%g1), %o0     ! restore %o0

        .align 16
.Lsrc_dst_unaligned_on_8:
        ! DST is 8-byte aligned, src is not
2:
        andcc   %o1, 0x3, %o5           ! test word alignment
        bnz,pt  %xcc, .Lunalignsetup    ! branch to skip if not word aligned
         nop

/*
 * Handle all cases where src and dest are aligned on word
 * boundaries. Use unrolled loops for better performance.
 * This option wins over the standard large data move when
 * source and destination are in cache, for .Lmedium-sized
 * down to short data moves.
 */
        set MED_WMAX, %o3
        cmp %o2, %o3                    ! limit to store buffer size
        bge,pt  %xcc, .Lunalignrejoin   ! otherwise rejoin main loop
         nop

        subcc   %o2, 31, %o2            ! adjust length to allow cc test
                                        ! for end of loop
        ble,pt  %xcc, .Lmedw31          ! skip big loop if fewer than 32 bytes
.Lmedw32:
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_31)! move a block of 32
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_31)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_31)
        subcc   %o2, 32, %o2            ! decrement length count
        EX_LD(LOAD(ld, %o1+8, %o4), memcpy_retl_o2_plus_31_24)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1+12, %o4), memcpy_retl_o2_plus_31_24)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0+8), memcpy_retl_o2_plus_31_24)
        add     %o1, 32, %o1            ! increase src ptr by 32
        EX_LD(LOAD(ld, %o1-16, %o4), memcpy_retl_o2_plus_31_16)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-12, %o4), memcpy_retl_o2_plus_31_16)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0+16), memcpy_retl_o2_plus_31_16)
        add     %o0, 32, %o0            ! increase dst ptr by 32
        EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_31_8)
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_31_8)
        or      %o4, %o5, %o5
        bgu,pt  %xcc, .Lmedw32          ! repeat if at least 32 bytes left
         EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_31_8)
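
/*
 * With a word-aligned source and an 8-byte-aligned destination, each
 * pair of 32-bit loads above is merged into a single 64-bit stx
 * (SPARC is big-endian, so the first word becomes the high half).
 * Sketch of one merge:
 *
 *      u64 hi = *(const u32 *)src;
 *      u64 lo = *(const u32 *)(src + 4);
 *      *(u64 *)dst = (hi << 32) | lo;
 */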
.Lmedw31:
        addcc   %o2, 31, %o2            ! restore count

        bz,pt   %xcc, .Lsmallx          ! exit if finished
         nop
        cmp     %o2, 16
        blt,pt  %xcc, .Lmedw15
         nop
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)! move a block of 16 bytes
        sllx    %o4, 32, %o5
        subcc   %o2, 16, %o2            ! decrement length count
        EX_LD(LOAD(ld, %o1+4, %o4), memcpy_retl_o2_plus_16)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_16)
        add     %o1, 16, %o1            ! increase src ptr by 16
        EX_LD(LOAD(ld, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 16, %o0            ! increase dst ptr by 16
        sllx    %o4, 32, %o5
        EX_LD(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_8)
        or      %o4, %o5, %o5
        EX_ST(STORE(stx, %o5, %o0-8), memcpy_retl_o2_plus_8)
.Lmedw15:
        bz,pt   %xcc, .Lsmallx          ! exit if finished
         cmp    %o2, 8
        blt,pn  %xcc, .Lmedw7           ! skip if 7 or fewer bytes left
         tst    %o2
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)       ! load 4 bytes
        subcc   %o2, 8, %o2             ! decrease count by 8
        EX_ST(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_8)! and store 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        EX_LD(LOAD(ld, %o1-4, %o3), memcpy_retl_o2_plus_4)      ! load 4 bytes
        add     %o0, 8, %o0             ! increase dst ptr by 8
        EX_ST(STORE(stw, %o3, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
        bz,pt   %xcc, .Lsmallx          ! exit if finished
.Lmedw7:                                ! count is ge 1, less than 8
        cmp     %o2, 4                  ! check for 4 bytes left
        blt,pn  %xcc, .Lsmallleft3      ! skip if 3 or fewer bytes left
         nop                            !
        EX_LD(LOAD(ld, %o1, %o4), memcpy_retl_o2)       ! load 4 bytes
        add     %o1, 4, %o1             ! increase src ptr by 4
        add     %o0, 4, %o0             ! increase dst ptr by 4
        subcc   %o2, 4, %o2             ! decrease count by 4
        bnz     .Lsmallleft3
         EX_ST(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_4)! and store 4 bytes
        retl
         mov    EX_RETVAL(%g1), %o0

        .align 16
.Llarge_align8_copy:                    ! Src and dst share 8 byte alignment
        ! align dst to 64 byte boundary
        andcc   %o0, 0x3f, %o3          ! %o3 == 0 means dst is 64 byte aligned
        brz,pn  %o3, .Laligned_to_64
         andcc  %o0, 8, %o3             ! odd long words to move?
        brz,pt  %o3, .Laligned_to_16
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 8, %o2
        add     %o1, 8, %o1             ! increment src ptr
        add     %o0, 8, %o0             ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_16:
        andcc   %o0, 16, %o3            ! pair of long words to move?
        brz,pt  %o3, .Laligned_to_32
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 16, %o2
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_16)
        add     %o1, 16, %o1            ! increment src ptr
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 16, %o0            ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
.Laligned_to_32:
        andcc   %o0, 32, %o3            ! four long words to move?
        brz,pt  %o3, .Laligned_to_64
         nop
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2)
        sub     %o2, 32, %o2
        EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_32)
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_24)
        EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_24)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_16)
        EX_ST(STORE(stx, %o4, %o0+16), memcpy_retl_o2_plus_16)
        add     %o1, 32, %o1            ! increment src ptr
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_8)
        add     %o0, 32, %o0            ! increment dst ptr
        EX_ST(STORE(stx, %o4, %o0-8), memcpy_retl_o2_plus_8)
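
/*
 * The three blocks above peel off 8, then 16, then 32 bytes as needed
 * so the destination reaches a 64-byte (cache line) boundary before
 * the block-init-store loop.  Sketch (copyN() stand in for the inline
 * ldx/stx sequences):
 *
 *      if (dst & 8)  { copy8();  src += 8;  dst += 8;  n -= 8;  }
 *      if (dst & 16) { copy16(); src += 16; dst += 16; n -= 16; }
 *      if (dst & 32) { copy32(); src += 32; dst += 32; n -= 32; }
 */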
.Laligned_to_64:
!
!       Using block init store (BIS) instructions to avoid fetching cache
!       lines from memory. Use ST_CHUNK stores to first element of each cache
!       line (similar to prefetching) to avoid overfilling STQ or miss buffers.
!       Gives existing cache lines time to be moved out of L1/L2/L3 cache.
!       Initial stores using MRU version of BIS to keep cache line in
!       cache until we are ready to store final element of cache line.
!       Then store last element using the LRU version of BIS.
!
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
!
!       We use STORE_MRU_ASI for the first seven stores to each cache line
!       followed by STORE_ASI (mark as LRU) for the last store. That
!       mixed approach reduces the probability that the cache line is removed
!       before we finish setting it, while minimizing the effects on
!       other cached values during a large memcpy.
!
!       ST_CHUNK batches up initial BIS operations for several cache lines
!       to allow multiple requests to not be blocked by overflowing the
!       store miss buffer. Then the matching stores for all those
!       BIS operations are executed.
!

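/*
 * Shape of the two-phase BIS loop below, as a C sketch (d and s viewed
 * as arrays of 64-byte lines, eight u64s each):
 *
 *      for (i = 0; i < ST_CHUNK; i++)          // .Lalign_loop_start
 *              d[i][0] = s[i][0];              // BIS-MRU claims the line
 *      for (i = 0; i < ST_CHUNK; i++) {        // .Lalign_loop_rest
 *              for (j = 1; j < 7; j++)
 *                      d[i][j] = s[i][j];      // BIS-MRU, words 1-6
 *              d[i][7] = s[i][7];              // BIS-LRU releases the line
 *      }
 *
 * %o0 is biased down by 8 below so that each "add %o0, 8, %o0" lands
 * exactly on the word being stored.
 */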
        sub     %o0, 8, %o0             ! adjust %o0 for ASI alignment
.Lalign_loop:
        cmp     %o5, ST_CHUNK*64
        blu,pt  %xcc, .Lalign_loop_fin
         mov    ST_CHUNK,%o3
.Lalign_loop_start:
        prefetch [%o1 + (ALIGN_PRE * BLOCK_SIZE)], 21
        subcc   %o3, 1, %o3
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 64, %o1
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        bgu     %xcc,.Lalign_loop_start
         add    %o0, 56, %o0

        mov     ST_CHUNK,%o3
        sllx    %o3, 6, %o4             ! ST_CHUNK*64
        sub     %o1, %o4, %o1           ! reset %o1
        sub     %o0, %o4, %o0           ! reset %o0

.Lalign_loop_rest:
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 16, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        subcc   %o3, 1, %o3
        EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 64, %o1
        add     %o0, 8, %o0
        EX_ST(STORE_INIT_MRU(%o4, %o0), memcpy_retl_o2_plus_o5)
        add     %o0, 8, %o0
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5)
        sub     %o5, 64, %o5
        bgu     %xcc,.Lalign_loop_rest
        ! mark cache line as LRU
         EX_ST(STORE_INIT(%o4, %o0), memcpy_retl_o2_plus_o5_plus_64)

        cmp     %o5, ST_CHUNK*64
        bgu,pt  %xcc, .Lalign_loop_start
         mov    ST_CHUNK,%o3

        cmp     %o5, 0
        beq     .Lalign_done
         nop
.Lalign_loop_fin:
        EX_LD(LOAD(ldx, %o1, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+8, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8+8), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1+16, %o4), memcpy_retl_o2_plus_o5)
        EX_ST(STORE(stx, %o4, %o0+8+16), memcpy_retl_o2_plus_o5)
        subcc   %o5, 64, %o5
        EX_LD(LOAD(ldx, %o1+24, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+24), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+32, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+32), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+40, %o4), memcpy_retl_o2_plus_o5_64)
        EX_ST(STORE(stx, %o4, %o0+8+40), memcpy_retl_o2_plus_o5_64)
        EX_LD(LOAD(ldx, %o1+48, %o4), memcpy_retl_o2_plus_o5_64)
        add     %o1, 64, %o1
        EX_ST(STORE(stx, %o4, %o0+8+48), memcpy_retl_o2_plus_o5_64)
        add     %o0, 64, %o0
        EX_LD(LOAD(ldx, %o1-8, %o4), memcpy_retl_o2_plus_o5_64)
        bgu     %xcc,.Lalign_loop_fin
         EX_ST(STORE(stx, %o4, %o0), memcpy_retl_o2_plus_o5_64)

.Lalign_done:
        add     %o0, 8, %o0             ! restore %o0 from ASI alignment
        membar  #StoreStore
        sub     %o2, 63, %o2            ! adjust length to allow cc test
        ba      .Lmedl63                ! in .Lmedl63
         nop

        .align 16
        ! Dst is on 8 byte boundary; src is not; remaining count > SMALL_MAX
.Lunalignsetup:
.Lunalignrejoin:
        mov     %g1, %o3        ! save %g1 as VISEntryHalf clobbers it
#ifdef NON_USER_COPY
        VISEntryHalfFast(.Lmedium_vis_entry_fail_cp)
#else
        VISEntryHalf
#endif
        mov     %o3, %g1        ! restore %g1

        set MED_UMAX, %o3
        cmp %o2, %o3            ! check for .Lmedium unaligned limit
        bge,pt  %xcc,.Lunalign_large
         prefetch [%o1 + (4 * BLOCK_SIZE)], 20
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        cmp     %o2, 8                  ! Ensure we do not load beyond
        bgt     .Lunalign_adjust        ! end of source buffer
         andn   %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o2, 64, %o2            ! adjust to leave loop
        sub     %o5, 64, %o5            ! early if necessary
.Lunalign_adjust:
        alignaddr %o1, %g0, %g0         ! generate %gsr
        add     %o1, %o5, %o1           ! advance %o1 to after blocks
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)
.Lunalign_loop:
        EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
        faligndata %f0, %f2, %f16
        EX_LD_FP(LOAD(ldd, %o4+16, %f4), memcpy_retl_o2_plus_o5)
        subcc   %o5, BLOCK_SIZE, %o5
        EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5_plus_64)
        faligndata %f2, %f4, %f18
        EX_LD_FP(LOAD(ldd, %o4+24, %f6), memcpy_retl_o2_plus_o5_plus_56)
        EX_ST_FP(STORE(std, %f18, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
        faligndata %f4, %f6, %f20
        EX_LD_FP(LOAD(ldd, %o4+32, %f8), memcpy_retl_o2_plus_o5_plus_48)
        EX_ST_FP(STORE(std, %f20, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
        faligndata %f6, %f8, %f22
        EX_LD_FP(LOAD(ldd, %o4+40, %f10), memcpy_retl_o2_plus_o5_plus_40)
        EX_ST_FP(STORE(std, %f22, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f8, %f10, %f24
        EX_LD_FP(LOAD(ldd, %o4+48, %f12), memcpy_retl_o2_plus_o5_plus_32)
        EX_ST_FP(STORE(std, %f24, %o0+32), memcpy_retl_o2_plus_o5_plus_32)
        faligndata %f10, %f12, %f26
        EX_LD_FP(LOAD(ldd, %o4+56, %f14), memcpy_retl_o2_plus_o5_plus_24)
        add     %o4, BLOCK_SIZE, %o4
        EX_ST_FP(STORE(std, %f26, %o0+40), memcpy_retl_o2_plus_o5_plus_24)
        faligndata %f12, %f14, %f28
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5_plus_16)
        EX_ST_FP(STORE(std, %f28, %o0+48), memcpy_retl_o2_plus_o5_plus_16)
        faligndata %f14, %f0, %f30
        EX_ST_FP(STORE(std, %f30, %o0+56), memcpy_retl_o2_plus_o5_plus_8)
        add     %o0, BLOCK_SIZE, %o0
        bgu,pt  %xcc, .Lunalign_loop
         prefetch [%o4 + (5 * BLOCK_SIZE)], 20
        ba      .Lunalign_done
         nop

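/*
 * alignaddr above latched (src & 7) into %gsr; each faligndata then
 * extracts one destination-aligned 8-byte window from two consecutive
 * aligned doublewords.  On this path src is not 8-byte aligned, so the
 * shift is nonzero; the big-endian C equivalent of one step is:
 *
 *      unsigned sh = (src & 7) * 8;            // 8..56 here
 *      u64 w0 = asrc[i], w1 = asrc[i + 1];     // aligned ldd pair
 *      ((u64 *)dst)[i] = (w0 << sh) | (w1 >> (64 - sh));
 */
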
.Lunalign_large:
        andcc   %o0, 0x3f, %o3          ! is dst 64-byte block aligned?
        bz      %xcc, .Lunalignsrc
         sub    %o3, 64, %o3            ! %o3 will be multiple of 8
        neg     %o3                     ! bytes until dest is 64 byte aligned
        sub     %o2, %o3, %o2           ! update cnt with bytes to be moved
        ! Move bytes according to source alignment
        andcc   %o1, 0x1, %o5
        bnz     %xcc, .Lunalignbyte     ! check for byte alignment
         nop
        andcc   %o1, 2, %o5             ! check for half word alignment
        bnz     %xcc, .Lunalignhalf
         nop
        ! Src is word aligned
.Lunalignword:
        EX_LD_FP(LOAD(ld, %o1, %o4), memcpy_retl_o2_plus_o3)    ! load 4 bytes
        add     %o1, 8, %o1             ! increase src ptr by 8
        EX_ST_FP(STORE(stw, %o4, %o0), memcpy_retl_o2_plus_o3)  ! and store 4
        subcc   %o3, 8, %o3             ! decrease count by 8
        EX_LD_FP(LOAD(ld, %o1-4, %o4), memcpy_retl_o2_plus_o3_plus_4)! load 4
        add     %o0, 8, %o0             ! increase dst ptr by 8
        bnz     %xcc, .Lunalignword
         EX_ST_FP(STORE(stw, %o4, %o0-4), memcpy_retl_o2_plus_o3_plus_4)
        ba      .Lunalignsrc
         nop

        ! Src is half-word aligned
.Lunalignhalf:
        EX_LD_FP(LOAD(lduh, %o1, %o4), memcpy_retl_o2_plus_o3)  ! load 2 bytes
        sllx    %o4, 32, %o5            ! shift left
        EX_LD_FP(LOAD(lduw, %o1+2, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        sllx    %o5, 16, %o5
        EX_LD_FP(LOAD(lduh, %o1+6, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
        add     %o1, 8, %o1
        subcc   %o3, 8, %o3
        bnz     %xcc, .Lunalignhalf
         add    %o0, 8, %o0
        ba      .Lunalignsrc
         nop

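/*
 * Half-word-aligned source: each iteration above merges 2 + 4 + 2
 * bytes into one 64-bit store, in big-endian order.  Sketch:
 *
 *      u64 v = *(const u16 *)src;
 *      v = (v << 32) | *(const u32 *)(src + 2);
 *      v = (v << 16) | *(const u16 *)(src + 6);
 *      *(u64 *)dst = v;
 */
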
        ! Src is Byte aligned
.Lunalignbyte:
        sub     %o0, %o1, %o0           ! share pointer advance
.Lunalignbyte_loop:
        EX_LD_FP(LOAD(ldub, %o1, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 56, %o5
        EX_LD_FP(LOAD(lduh, %o1+1, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 40, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(lduh, %o1+3, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4, 24, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(lduh, %o1+5, %o4), memcpy_retl_o2_plus_o3)
        sllx    %o4,  8, %o4
        or      %o4, %o5, %o5
        EX_LD_FP(LOAD(ldub, %o1+7, %o4), memcpy_retl_o2_plus_o3)
        or      %o4, %o5, %o5
        add     %o0, %o1, %o0
        EX_ST_FP(STORE(stx, %o5, %o0), memcpy_retl_o2_plus_o3)
        sub     %o0, %o1, %o0
        subcc   %o3, 8, %o3
        bnz     %xcc, .Lunalignbyte_loop
         add    %o1, 8, %o1
        add     %o0,%o1, %o0            ! restore pointer

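/*
 * Odd source address: each iteration above merges 1 + 2 + 2 + 2 + 1
 * bytes into one 64-bit store (src + 1/3/5 are even, so the lduh
 * accesses stay aligned), and %o0 carries (dst - src) across the loop
 * so that only %o1 advances.  Big-endian sketch of one store:
 *
 *      u64 v;
 *      v  = (u64)src[0] << 56;
 *      v |= (u64)*(const u16 *)(src + 1) << 40;
 *      v |= (u64)*(const u16 *)(src + 3) << 24;
 *      v |= (u64)*(const u16 *)(src + 5) << 8;
 *      v |= src[7];
 *      *(u64 *)(src + diff) = v;       // diff = dst - src, kept in %o0
 */
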
        ! Destination is now block (64 byte aligned)
.Lunalignsrc:
        andn    %o2, 0x3f, %o5          ! %o5 is multiple of block size
        and     %o2, 0x3f, %o2          ! residue bytes in %o2
        add     %o2, 64, %o2            ! Ensure we do not load beyond
        sub     %o5, 64, %o5            ! end of source buffer

        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        alignaddr %o1, %g0, %g0         ! generate %gsr
        add     %o1, %o5, %o1           ! advance %o1 to after blocks

        EX_LD_FP(LOAD(ldd, %o4, %f14), memcpy_retl_o2_plus_o5)
        add     %o4, 8, %o4
.Lunalign_sloop:
        EX_LD_FP(LOAD(ldd, %o4, %f16), memcpy_retl_o2_plus_o5)
        faligndata %f14, %f16, %f0
        EX_LD_FP(LOAD(ldd, %o4+8, %f18), memcpy_retl_o2_plus_o5)
        faligndata %f16, %f18, %f2
        EX_LD_FP(LOAD(ldd, %o4+16, %f20), memcpy_retl_o2_plus_o5)
        faligndata %f18, %f20, %f4
        EX_ST_FP(STORE(std, %f0, %o0), memcpy_retl_o2_plus_o5)
        subcc   %o5, 64, %o5
        EX_LD_FP(LOAD(ldd, %o4+24, %f22), memcpy_retl_o2_plus_o5_plus_56)
        faligndata %f20, %f22, %f6
        EX_ST_FP(STORE(std, %f2, %o0+8), memcpy_retl_o2_plus_o5_plus_56)
        EX_LD_FP(LOAD(ldd, %o4+32, %f24), memcpy_retl_o2_plus_o5_plus_48)
        faligndata %f22, %f24, %f8
        EX_ST_FP(STORE(std, %f4, %o0+16), memcpy_retl_o2_plus_o5_plus_48)
        EX_LD_FP(LOAD(ldd, %o4+40, %f26), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f24, %f26, %f10
        EX_ST_FP(STORE(std, %f6, %o0+24), memcpy_retl_o2_plus_o5_plus_40)
        EX_LD_FP(LOAD(ldd, %o4+48, %f28), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f26, %f28, %f12
        EX_ST_FP(STORE(std, %f8, %o0+32), memcpy_retl_o2_plus_o5_plus_40)
        add     %o4, 64, %o4
        EX_LD_FP(LOAD(ldd, %o4-8, %f30), memcpy_retl_o2_plus_o5_plus_40)
        faligndata %f28, %f30, %f14
        EX_ST_FP(STORE(std, %f10, %o0+40), memcpy_retl_o2_plus_o5_plus_40)
        EX_ST_FP(STORE(std, %f12, %o0+48), memcpy_retl_o2_plus_o5_plus_40)
        add     %o0, 64, %o0
        EX_ST_FP(STORE(std, %f14, %o0-8), memcpy_retl_o2_plus_o5_plus_40)
        fsrc2   %f30, %f14
        bgu,pt  %xcc, .Lunalign_sloop
         prefetch [%o4 + (8 * BLOCK_SIZE)], 20

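/*
 * The block loop above is software pipelined: the last doubleword of
 * each 64-byte block is carried into the next iteration (fsrc2 %f30,
 * %f14) so every faligndata has both of its inputs available.  Rough
 * big-endian C model (sh = (src & 7) * 8, asrc = aligned source):
 *
 *      u64 prev = asrc[0];
 *      for (b = 0; b < nblocks; b++)
 *              for (i = 0; i < 8; i++) {
 *                      u64 cur = asrc[8 * b + i + 1];
 *                      dst64[8 * b + i] = (prev << sh) | (cur >> (64 - sh));
 *                      prev = cur;     // becomes %f14 via fsrc2
 *              }
 */
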
.Lunalign_done:
        ! Handle trailing bytes, 64 to 127
        ! Dest long word aligned, Src not long word aligned
        cmp     %o2, 15
        bleu    %xcc, .Lunalign_short

         andn   %o2, 0x7, %o5           ! %o5 is multiple of 8
        and     %o2, 0x7, %o2           ! residue bytes in %o2
        add     %o2, 8, %o2
        sub     %o5, 8, %o5             ! ensure we do not load past end of src
        andn    %o1, 0x7, %o4           ! %o4 has long word aligned src address
        add     %o1, %o5, %o1           ! advance %o1 to after multiple of 8
        EX_LD_FP(LOAD(ldd, %o4, %f0), memcpy_retl_o2_plus_o5)! fetch partialword
.Lunalign_by8:
        EX_LD_FP(LOAD(ldd, %o4+8, %f2), memcpy_retl_o2_plus_o5)
        add     %o4, 8, %o4
        faligndata %f0, %f2, %f16
        subcc   %o5, 8, %o5
        EX_ST_FP(STORE(std, %f16, %o0), memcpy_retl_o2_plus_o5)
        fsrc2   %f2, %f0
        bgu,pt  %xcc, .Lunalign_by8
         add    %o0, 8, %o0

.Lunalign_short:
#ifdef NON_USER_COPY
        VISExitHalfFast
#else
        VISExitHalf
#endif
        ba      .Lsmallrest
         nop

/*
 * This is a special case of nested memcpy. This can happen when the
 * kernel calls unaligned memcpy back to back without saving FP
 * registers. We need traps (context switches) to save/restore FP
 * registers. If the kernel calls memcpy without this trap sequence we
 * will hit FP corruption. Let's use the normal integer load/store
 * method in this case.
 */

#ifdef NON_USER_COPY
.Lmedium_vis_entry_fail_cp:
        or      %o0, %o1, %g2
#endif
.Lmedium_cp:
        LOAD(prefetch, %o1 + 0x40, #n_reads_strong)
        andcc   %g2, 0x7, %g0
        bne,pn  %xcc, .Lmedium_unaligned_cp
         nop

.Lmedium_noprefetch_cp:
        andncc  %o2, 0x20 - 1, %o5
        be,pn   %xcc, 2f
         sub    %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x08, %g2), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x10, %g7), memcpy_retl_o2_plus_o5)
        EX_LD(LOAD(ldx, %o1 + 0x18, %o4), memcpy_retl_o2_plus_o5)
        add     %o1, 0x20, %o1
        subcc   %o5, 0x20, %o5
        EX_ST(STORE(stx, %o3, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_32)
        EX_ST(STORE(stx, %g2, %o0 + 0x08), memcpy_retl_o2_plus_o5_plus_24)
        EX_ST(STORE(stx, %g7, %o0 + 0x10), memcpy_retl_o2_plus_o5_plus_24)
        EX_ST(STORE(stx, %o4, %o0 + 0x18), memcpy_retl_o2_plus_o5_plus_8)
        bne,pt  %xcc, 1b
         add    %o0, 0x20, %o0
2:      andcc   %o2, 0x18, %o5
        be,pt   %xcc, 3f
         sub    %o2, %o5, %o2
1:      EX_LD(LOAD(ldx, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x08, %o1
        add     %o0, 0x08, %o0
        subcc   %o5, 0x08, %o5
        bne,pt  %xcc, 1b
         EX_ST(STORE(stx, %o3, %o0 - 0x08), memcpy_retl_o2_plus_o5_plus_8)
3:      brz,pt  %o2, .Lexit_cp
         cmp    %o2, 0x04
        bl,pn   %xcc, .Ltiny_cp
         nop
        EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2)
        add     %o1, 0x04, %o1
        add     %o0, 0x04, %o0
        subcc   %o2, 0x04, %o2
        bne,pn  %xcc, .Ltiny_cp
         EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_4)
        ba,a,pt %xcc, .Lexit_cp

.Lmedium_unaligned_cp:
        /* First get dest 8 byte aligned.  */
        sub     %g0, %o0, %o3
        and     %o3, 0x7, %o3
        brz,pt  %o3, 2f
         sub    %o2, %o3, %o2

1:      EX_LD(LOAD(ldub, %o1 + 0x00, %g2), memcpy_retl_o2_plus_g1)
        add     %o1, 1, %o1
        subcc   %o3, 1, %o3
        add     %o0, 1, %o0
        bne,pt  %xcc, 1b
         EX_ST(STORE(stb, %g2, %o0 - 0x01), memcpy_retl_o2_plus_g1_plus_1)
2:
        and     %o1, 0x7, %o3
        brz,pn  %o3, .Lmedium_noprefetch_cp
         sll    %o3, 3, %o3
        mov     64, %g2
        sub     %g2, %o3, %g2
        andn    %o1, 0x7, %o1
        EX_LD(LOAD(ldx, %o1 + 0x00, %o4), memcpy_retl_o2)
        sllx    %o4, %o3, %o4
        andn    %o2, 0x08 - 1, %o5
        sub     %o2, %o5, %o2

1:      EX_LD(LOAD(ldx, %o1 + 0x08, %g3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x08, %o1
        subcc   %o5, 0x08, %o5
        srlx    %g3, %g2, %g7
        or      %g7, %o4, %g7
        EX_ST(STORE(stx, %g7, %o0 + 0x00), memcpy_retl_o2_plus_o5_plus_8)
        add     %o0, 0x08, %o0
        bne,pt  %xcc, 1b
         sllx   %g3, %o3, %o4
        srl     %o3, 3, %o3
        add     %o1, %o3, %o1
        brz,pn  %o2, .Lexit_cp
         nop
        ba,pt   %xcc, .Lsmall_unaligned_cp

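/*
 * The loop above is the classic shift-register merge: the leftover
 * high bits of the previous aligned doubleword stay in %o4 and the
 * top of the next one is ORed in.  Big-endian C sketch:
 *
 *      unsigned ls = (src & 7) * 8, rs = 64 - ls;      // %o3, %g2
 *      const u64 *s = (const u64 *)(src & ~7UL);
 *      u64 carry = *s << ls;
 *      while (n8--) {
 *              u64 next = *++s;
 *              *(u64 *)dst = carry | (next >> rs);
 *              carry = next << ls;
 *              dst += 8;
 *      }
 *      src = (const char *)s + (ls >> 3);      // the "srl %o3, 3" above
 */
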
.Ltiny_cp:
        EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
        subcc   %o2, 1, %o2
        be,pn   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x00), memcpy_retl_o2_plus_1)
        EX_LD(LOAD(ldub, %o1 + 0x01, %o3), memcpy_retl_o2)
        subcc   %o2, 1, %o2
        be,pn   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x01), memcpy_retl_o2_plus_1)
        EX_LD(LOAD(ldub, %o1 + 0x02, %o3), memcpy_retl_o2)
        ba,pt   %xcc, .Lexit_cp
         EX_ST(STORE(stb, %o3, %o0 + 0x02), memcpy_retl_o2)

.Lsmall_cp:
        andcc   %g2, 0x3, %g0
        bne,pn  %xcc, .Lsmall_unaligned_cp
         andn   %o2, 0x4 - 1, %o5
        sub     %o2, %o5, %o2
1:
        EX_LD(LOAD(lduw, %o1 + 0x00, %o3), memcpy_retl_o2_plus_o5)
        add     %o1, 0x04, %o1
        subcc   %o5, 0x04, %o5
        add     %o0, 0x04, %o0
        bne,pt  %xcc, 1b
         EX_ST(STORE(stw, %o3, %o0 - 0x04), memcpy_retl_o2_plus_o5_plus_4)
        brz,pt  %o2, .Lexit_cp
         nop
        ba,a,pt %xcc, .Ltiny_cp

.Lsmall_unaligned_cp:
1:      EX_LD(LOAD(ldub, %o1 + 0x00, %o3), memcpy_retl_o2)
        add     %o1, 1, %o1
        add     %o0, 1, %o0
        subcc   %o2, 1, %o2
        bne,pt  %xcc, 1b
         EX_ST(STORE(stb, %o3, %o0 - 0x01), memcpy_retl_o2_plus_1)
        ba,a,pt %xcc, .Lexit_cp

.Lsmallrest:
        tst     %o2
        bz,pt   %xcc, .Lsmallx
         cmp    %o2, 4
        blt,pn  %xcc, .Lsmallleft3
         nop
        sub     %o2, 3, %o2
.Lsmallnotalign4:
        EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_3)! read byte
        subcc   %o2, 4, %o2             ! reduce count by 4
        EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_7)! write byte & repeat
        EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2_plus_6)! for total of 4
        add     %o1, 4, %o1             ! advance SRC by 4
        EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_6)
        EX_LD(LOAD(ldub, %o1-2, %o3), memcpy_retl_o2_plus_5)
        add     %o0, 4, %o0             ! advance DST by 4
        EX_ST(STORE(stb, %o3, %o0-2), memcpy_retl_o2_plus_5)
        EX_LD(LOAD(ldub, %o1-1, %o3), memcpy_retl_o2_plus_4)
        bgu,pt  %xcc, .Lsmallnotalign4  ! loop til 3 or fewer bytes remain
         EX_ST(STORE(stb, %o3, %o0-1), memcpy_retl_o2_plus_4)
        addcc   %o2, 3, %o2             ! restore count
        bz,pt   %xcc, .Lsmallx
.Lsmallleft3:                           ! 1, 2, or 3 bytes remain
        subcc   %o2, 1, %o2
        EX_LD(LOAD(ldub, %o1, %o3), memcpy_retl_o2_plus_1)      ! load one byte
        bz,pt   %xcc, .Lsmallx
         EX_ST(STORE(stb, %o3, %o0), memcpy_retl_o2_plus_1)     ! store one byte
        EX_LD(LOAD(ldub, %o1+1, %o3), memcpy_retl_o2)   ! load second byte
        subcc   %o2, 1, %o2
        bz,pt   %xcc, .Lsmallx
         EX_ST(STORE(stb, %o3, %o0+1), memcpy_retl_o2_plus_1)! store second byte
        EX_LD(LOAD(ldub, %o1+2, %o3), memcpy_retl_o2)   ! load third byte
        EX_ST(STORE(stb, %o3, %o0+2), memcpy_retl_o2)   ! store third byte
.Lsmallx:
        retl
         mov    EX_RETVAL(%g1), %o0
.Lsmallfin:
        tst     %o2
        bnz,pn  %xcc, .Lsmallleft3
         nop
        retl
         mov    EX_RETVAL(%g1), %o0     ! restore %o0
.Lexit_cp:
        retl
         mov    EX_RETVAL(%g1), %o0
        .size  FUNC_NAME, .-FUNC_NAME
