memcpy 多字节赋值问题
以前没注意过多字节赋值问题,考虑的角度是充分利用 cpu 的 32 位带宽。一次复制 1 个字节和一次复制 4 个字节占用的 cpu 指令周期是一样的,既然我们的 cpu 能同时操作 32 位,为什么我们不能一次复制 4 个字节呢?如此一来,总运行的指令数将降低到原来的 1/4 !
于是有下面代码(henix转自http://www.embedded.com/columns/technicalinsights/19205567?_requestid=212290):
文章图片
文章图片
代码
/*
 * Copy `count` bytes from `src` to `dest` and return `dest`.
 *
 * Going through a `long` pointer is what forces the copy to proceed one
 * machine word at a time (4 bytes on a 32-bit target); whatever is left
 * over is then copied byte by byte.  The two regions must not overlap.
 *
 * Fixes relative to the original listing:
 *   - the byte-wise tail no longer starts from uninitialised pointers when
 *     `count` is smaller than one word;
 *   - the word size is taken from sizeof(long) instead of being assumed to
 *     be 4, so targets with an 8-byte long do not copy past the end;
 *   - word accesses are attempted only when both pointers are word aligned,
 *     because dereferencing a misaligned `long *` is undefined behaviour;
 *   - `src` is const-qualified (existing callers are unaffected).
 */
void *memcpy(void *dest, const void *src, size_t count)
{
    unsigned char *pct = (unsigned char *)dest;
    const unsigned char *pcs = (const unsigned char *)src;
    const size_t word = sizeof(long);

    if ((size_t)dest % word == 0 && (size_t)src % word == 0) {
        /* First copy as many whole words as fit. */
        long *pt = (long *)dest;
        const long *ps = (const long *)src;
        size_t c = count / word;

        count %= word; /* bytes left over for the tail */
        for (; c != 0; c--) {
            *pt++ = *ps++;
        }

        /* The byte-wise tail resumes where the word loop stopped. */
        pct = (unsigned char *)pt;
        pcs = (const unsigned char *)ps;
    }

    /* Copy the remainder (or everything, when unaligned) one byte at a time. */
    for (; count != 0; count--) {
        *pct++ = *pcs++;
    }

    return dest;
}
还有对 3 种 memcpy 实现的对比,上述算法受地址对齐的影响较为严重。在地址按 4 字节对齐的时候,上述算法的效率比单字节 memcpy 实现高很多;但如果地址没有按 4 字节对齐,则其效率并不高,有时甚至比普通 memcpy 还低。这可能是因为,虽然上述算法减少了 cpu 的指令数,但内存的速度比 cpu 慢得多,速度的瓶颈还是在内存。
刚看了一份 memcpy 的实现:为了解决字节对齐的问题,它先按 128 字节一块进行复制,剩余部分再依次按 64、32、16、8、4、2 字节处理,直到 1 个字节。原文在 http://code.google.com/p/dingoo-sdk/source/browse/trunk/dingoo_sdk/src/libc/memcpy.c?spec=svn180&r=180
贴出如下:
#include "string.h"
#include <stdint.h>

/*
 * Fixed-size copy primitives.
 *
 * _memcpy_<W>x<K>(dst, src) copies K units of W bytes from src to dst.
 * The MIPS versions use lb/lh/lw, so the caller must make sure both pointers
 * are aligned to W bytes.
 *
 * Notes on the MIPS assembly:
 *   - operand %0 is the source (load address) and %1 the destination (store
 *     address);
 *   - every temporary written by the assembly is listed as clobbered, along
 *     with "memory", so the compiler neither keeps live values in those
 *     registers nor caches memory contents across the copy;
 *   - the clobber lists name $t0-$t7 as $8-$15 and $t8/$t9 as $24/$25, which
 *     is the o32 register numbering.
 *     NOTE(review): confirm the target ABI is o32 before reusing elsewhere.
 *
 * The helpers are `static inline` so that a definition always exists in this
 * translation unit regardless of which inline semantics the compiler uses.
 */
#if defined(__mips__)

static inline void _memcpy_1x1(void* dst, const void* src) {
	__asm__ __volatile__(
		"lb $t0, 0(%0)\n\t"
		"sb $t0, 0(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "memory"
	);
}

static inline void _memcpy_2x1(void* dst, const void* src) {
	__asm__ __volatile__(
		"lh $t0, 0(%0)\n\t"
		"sh $t0, 0(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "memory"
	);
}

static inline void _memcpy_4x1(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"sw $t0, 0(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "memory"
	);
}

static inline void _memcpy_4x2(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"lw $t1, 4(%0)\n\t"
		"sw $t0, 0(%1)\n\t"
		"sw $t1, 4(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "$9", "memory"
	);
}

static inline void _memcpy_4x4(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"lw $t1, 4(%0)\n\t"
		"lw $t2, 8(%0)\n\t"
		"lw $t3, 12(%0)\n\t"
		"sw $t0, 0(%1)\n\t"
		"sw $t1, 4(%1)\n\t"
		"sw $t2, 8(%1)\n\t"
		"sw $t3, 12(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "$9", "$10", "$11", "memory"
	);
}

static inline void _memcpy_4x8(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"lw $t1, 4(%0)\n\t"
		"lw $t2, 8(%0)\n\t"
		"lw $t3, 12(%0)\n\t"
		"lw $t4, 16(%0)\n\t"
		"lw $t5, 20(%0)\n\t"
		"lw $t6, 24(%0)\n\t"
		"lw $t7, 28(%0)\n\t"
		"sw $t0, 0(%1)\n\t"
		"sw $t1, 4(%1)\n\t"
		"sw $t2, 8(%1)\n\t"
		"sw $t3, 12(%1)\n\t"
		"sw $t4, 16(%1)\n\t"
		"sw $t5, 20(%1)\n\t"
		"sw $t6, 24(%1)\n\t"
		"sw $t7, 28(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15", "memory"
	);
}

/* 64 bytes: ten words are loaded up front, then stores are interleaved with
 * the remaining loads so each temporary is reused once it has been stored. */
static inline void _memcpy_4x16(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"lw $t1, 4(%0)\n\t"
		"lw $t2, 8(%0)\n\t"
		"lw $t3, 12(%0)\n\t"
		"lw $t4, 16(%0)\n\t"
		"lw $t5, 20(%0)\n\t"
		"lw $t6, 24(%0)\n\t"
		"lw $t7, 28(%0)\n\t"
		"lw $t8, 32(%0)\n\t"
		"lw $t9, 36(%0)\n\t"

		"sw $t0, 0(%1)\n\t"
		"lw $t0, 40(%0)\n\t"
		"sw $t1, 4(%1)\n\t"
		"lw $t1, 44(%0)\n\t"
		"sw $t2, 8(%1)\n\t"
		"lw $t2, 48(%0)\n\t"
		"sw $t3, 12(%1)\n\t"
		"lw $t3, 52(%0)\n\t"
		"sw $t4, 16(%1)\n\t"
		"lw $t4, 56(%0)\n\t"
		"sw $t5, 20(%1)\n\t"
		"lw $t5, 60(%0)\n\t"

		"sw $t6, 24(%1)\n\t"
		"sw $t7, 28(%1)\n\t"
		"sw $t8, 32(%1)\n\t"
		"sw $t9, 36(%1)\n\t"

		"sw $t0, 40(%1)\n\t"
		"sw $t1, 44(%1)\n\t"
		"sw $t2, 48(%1)\n\t"
		"sw $t3, 52(%1)\n\t"
		"sw $t4, 56(%1)\n\t"
		"sw $t5, 60(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15",
		  "$24", "$25", "memory"
	);
}

/* 128 bytes: same load/store interleaving as the 64-byte version, carried
 * on for three rounds of temporaries. */
static inline void _memcpy_4x32(void* dst, const void* src) {
	__asm__ __volatile__(
		"lw $t0, 0(%0)\n\t"
		"lw $t1, 4(%0)\n\t"
		"lw $t2, 8(%0)\n\t"
		"lw $t3, 12(%0)\n\t"
		"lw $t4, 16(%0)\n\t"
		"lw $t5, 20(%0)\n\t"
		"lw $t6, 24(%0)\n\t"
		"lw $t7, 28(%0)\n\t"
		"lw $t8, 32(%0)\n\t"
		"lw $t9, 36(%0)\n\t"

		"sw $t0, 0(%1)\n\t"
		"lw $t0, 40(%0)\n\t"
		"sw $t1, 4(%1)\n\t"
		"lw $t1, 44(%0)\n\t"
		"sw $t2, 8(%1)\n\t"
		"lw $t2, 48(%0)\n\t"
		"sw $t3, 12(%1)\n\t"
		"lw $t3, 52(%0)\n\t"
		"sw $t4, 16(%1)\n\t"
		"lw $t4, 56(%0)\n\t"
		"sw $t5, 20(%1)\n\t"
		"lw $t5, 60(%0)\n\t"
		"sw $t6, 24(%1)\n\t"
		"lw $t6, 64(%0)\n\t"
		"sw $t7, 28(%1)\n\t"
		"lw $t7, 68(%0)\n\t"
		"sw $t8, 32(%1)\n\t"
		"lw $t8, 72(%0)\n\t"
		"sw $t9, 36(%1)\n\t"
		"lw $t9, 76(%0)\n\t"

		"sw $t0, 40(%1)\n\t"
		"lw $t0, 80(%0)\n\t"
		"sw $t1, 44(%1)\n\t"
		"lw $t1, 84(%0)\n\t"
		"sw $t2, 48(%1)\n\t"
		"lw $t2, 88(%0)\n\t"
		"sw $t3, 52(%1)\n\t"
		"lw $t3, 92(%0)\n\t"
		"sw $t4, 56(%1)\n\t"
		"lw $t4, 96(%0)\n\t"
		"sw $t5, 60(%1)\n\t"
		"lw $t5, 100(%0)\n\t"
		"sw $t6, 64(%1)\n\t"
		"lw $t6, 104(%0)\n\t"
		"sw $t7, 68(%1)\n\t"
		"lw $t7, 108(%0)\n\t"
		"sw $t8, 72(%1)\n\t"
		"lw $t8, 112(%0)\n\t"
		"sw $t9, 76(%1)\n\t"
		"lw $t9, 116(%0)\n\t"

		"sw $t0, 80(%1)\n\t"
		"lw $t0, 120(%0)\n\t"
		"sw $t1, 84(%1)\n\t"
		"lw $t1, 124(%0)\n\t"

		"sw $t2, 88(%1)\n\t"
		"sw $t3, 92(%1)\n\t"
		"sw $t4, 96(%1)\n\t"
		"sw $t5, 100(%1)\n\t"
		"sw $t6, 104(%1)\n\t"
		"sw $t7, 108(%1)\n\t"
		"sw $t8, 112(%1)\n\t"
		"sw $t9, 116(%1)\n\t"

		"sw $t0, 120(%1)\n\t"
		"sw $t1, 124(%1)\n\t"
		: : "r"(src), "r"(dst)
		: "$8", "$9", "$10", "$11", "$12", "$13", "$14", "$15",
		  "$24", "$25", "memory"
	);
}

#else /* !__mips__ */

/* Portable stand-in for the assembly above: copy n bytes one at a time. */
static inline void _memcpy_bytes(uint8_t* dst, const uint8_t* src, unsigned n) {
	for(; n != 0; n--) {
		*dst++ = *src++;
	}
}

static inline void _memcpy_1x1(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 1);
}

static inline void _memcpy_2x1(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 2);
}

static inline void _memcpy_4x1(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 4);
}

static inline void _memcpy_4x2(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 8);
}

static inline void _memcpy_4x4(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 16);
}

static inline void _memcpy_4x8(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 32);
}

static inline void _memcpy_4x16(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 64);
}

static inline void _memcpy_4x32(void* dst, const void* src) {
	_memcpy_bytes((uint8_t*)dst, (const uint8_t*)src, 128);
}

#endif /* __mips__ */

/*
 * Step-down tail copiers.
 *
 * _memcpy_down_<N>(dst, src, dst_end) copies at most ONE block of N bytes if
 * at least N bytes remain before dst_end, then hands what is left to the
 * next smaller step (N/2), ending at a single byte.  Together they finish
 * any remainder shorter than 2*N.
 */
static inline void _memcpy_down_1(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 1) {
		_memcpy_1x1(dst, src);
	}
}

static inline void _memcpy_down_2(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 2) {
		_memcpy_2x1(dst, src);
		dst += 2;
		src += 2;
	}
	_memcpy_down_1(dst, src, dst_end);
}

static inline void _memcpy_down_4(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 4) {
		_memcpy_4x1(dst, src);
		dst += 4;
		src += 4;
	}
	_memcpy_down_2(dst, src, dst_end);
}

static inline void _memcpy_down_8(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 8) {
		_memcpy_4x2(dst, src);
		dst += 8;
		src += 8;
	}
	_memcpy_down_4(dst, src, dst_end);
}

static inline void _memcpy_down_16(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 16) {
		_memcpy_4x4(dst, src);
		dst += 16;
		src += 16;
	}
	_memcpy_down_8(dst, src, dst_end);
}

static inline void _memcpy_down_32(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 32) {
		_memcpy_4x8(dst, src);
		dst += 32;
		src += 32;
	}
	_memcpy_down_16(dst, src, dst_end);
}

static inline void _memcpy_down_64(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 64) {
		_memcpy_4x16(dst, src);
		dst += 64;
		src += 64;
	}
	_memcpy_down_32(dst, src, dst_end);
}

static inline void _memcpy_down_128(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	if((uintptr_t)(dst_end - dst) >= 128) {
		_memcpy_4x32(dst, src);
		dst += 128;
		src += 128;
	}
	_memcpy_down_64(dst, src, dst_end);
}

/*
 * Bulk copiers.
 *
 * _memcpy_<N>(dst, src, dst_end) copies N-byte blocks for as long as at least
 * N bytes remain before dst_end, then finishes the remainder with the
 * step-down chain starting at N/2.  The loop condition compares the number
 * of remaining bytes instead of forming `dst_end - N`, which would point
 * before the buffer whenever fewer than N bytes are being copied.
 */
static inline void _memcpy_1(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 1; dst += 1, src += 1) {
		_memcpy_1x1(dst, src);
	}
}

static inline void _memcpy_2(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 2; dst += 2, src += 2) {
		_memcpy_2x1(dst, src);
	}
	_memcpy_down_1(dst, src, dst_end);
}

static inline void _memcpy_4(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 4; dst += 4, src += 4) {
		_memcpy_4x1(dst, src);
	}
	_memcpy_down_2(dst, src, dst_end);
}

static inline void _memcpy_8(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 8; dst += 8, src += 8) {
		_memcpy_4x2(dst, src);
	}
	_memcpy_down_4(dst, src, dst_end);
}

static inline void _memcpy_16(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 16; dst += 16, src += 16) {
		_memcpy_4x4(dst, src);
	}
	_memcpy_down_8(dst, src, dst_end);
}

static inline void _memcpy_32(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 32; dst += 32, src += 32) {
		_memcpy_4x8(dst, src);
	}
	_memcpy_down_16(dst, src, dst_end);
}

static inline void _memcpy_64(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 64; dst += 64, src += 64) {
		_memcpy_4x16(dst, src);
	}
	_memcpy_down_32(dst, src, dst_end);
}

static inline void _memcpy_128(uint8_t* dst, const uint8_t* src, uint8_t* dst_end) {
	for(; (uintptr_t)(dst_end - dst) >= 128; dst += 128, src += 128) {
		_memcpy_4x32(dst, src);
	}
	_memcpy_down_64(dst, src, dst_end);
}

/*
 * Copy `size` bytes from `src` to `dst` (regions must not overlap).
 *
 * Returns `dst`, or NULL when either pointer is NULL (nothing is copied in
 * that case).
 *
 * Strategy:
 *   1. While dst and src share the same misalignment, peel off one byte,
 *      then one halfword, then one word, so that dst ends up aligned to
 *      2, 4 and finally 8 bytes.
 *   2. Pick the widest copier that BOTH pointers are aligned for.  The
 *      alignment of src is checked as well as that of dst: the halfword and
 *      word primitives load from src, so an aligned dst paired with a
 *      misaligned src must not take the wide path.
 */
void* _memcpy_fast(void* dst, const void* src, uintptr_t size) {
	if((dst == NULL) || (src == NULL))
		return NULL;

	uint8_t*       _dst     = (uint8_t*)dst;
	uint8_t*       _dst_end = _dst + size;
	const uint8_t* _src     = (const uint8_t*)src;

	/* dst odd, src odd too: one byte makes both even. */
	if((((uintptr_t)_dst & 1) != 0)
	   && (((uintptr_t)_dst & 1) == ((uintptr_t)_src & 1))
	   && ((uintptr_t)(_dst_end - _dst) >= 1)) {
		_memcpy_1x1(_dst, _src);
		_dst += 1;
		_src += 1;
	}

	/* dst even but not word aligned, src likewise: one halfword. */
	if((((uintptr_t)_dst & 1) == 0)
	   && (((uintptr_t)_dst & 3) != 0)
	   && (((uintptr_t)_dst & 3) == ((uintptr_t)_src & 3))
	   && ((uintptr_t)(_dst_end - _dst) >= 2)) {
		_memcpy_2x1(_dst, _src);
		_dst += 2;
		_src += 2;
	}

	/* dst word aligned but not 8-byte aligned, src likewise: one word. */
	if((((uintptr_t)_dst & 3) == 0)
	   && (((uintptr_t)_dst & 7) != 0)
	   && (((uintptr_t)_dst & 7) == ((uintptr_t)_src & 7))
	   && ((uintptr_t)(_dst_end - _dst) >= 4)) {
		_memcpy_4x1(_dst, _src);
		_dst += 4;
		_src += 4;
	}

	/* Low bits set in either address limit the usable access width. */
	const uintptr_t misalign = (uintptr_t)_dst | (uintptr_t)_src;

	if((misalign & 1) != 0) {
		_memcpy_1(_dst, _src, _dst_end);
	} else if((misalign & 3) != 0) {
		_memcpy_2(_dst, _src, _dst_end);
	} else if(((uintptr_t)_dst & 7) != 0) {
		_memcpy_4(_dst, _src, _dst_end);
	} else {
		_memcpy_128(_dst, _src, _dst_end);
	}

	return dst;
}
转载于:https://www.cnblogs.com/shapherd/archive/2010/05/19/1739628.html
推荐阅读
- 放屁有这三个特征的,请注意啦!这说明你的身体毒素太多
- 爱就是希望你好好活着
- 昨夜小楼听风
- 知识
- 死结。
- 我从来不做坏事
- 烦恼和幸福
- 关于QueryWrapper|关于QueryWrapper,实现MybatisPlus多表关联查询方式
- Linux下面如何查看tomcat已经使用多少线程
- 说得清,说不清