여기가 일단 테스트를 올린 곳이고요.
Visual Studio 2015
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=38ms memcpy=19 ms
result(dst aligned, src unalign): memcpy_fast=40ms memcpy=24 ms
result(dst unalign, src aligned): memcpy_fast=38ms memcpy=30 ms
result(dst unalign, src unalign): memcpy_fast=40ms memcpy=28 ms
benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=39ms memcpy=42 ms
result(dst aligned, src unalign): memcpy_fast=38ms memcpy=49 ms
result(dst unalign, src aligned): memcpy_fast=42ms memcpy=46 ms
result(dst unalign, src unalign): memcpy_fast=39ms memcpy=51 ms
benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_fast=54ms memcpy=101 ms
result(dst aligned, src unalign): memcpy_fast=72ms memcpy=103 ms
result(dst unalign, src aligned): memcpy_fast=45ms memcpy=105 ms
result(dst unalign, src unalign): memcpy_fast=55ms memcpy=102 ms
benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_fast=38ms memcpy=66 ms
result(dst aligned, src unalign): memcpy_fast=44ms memcpy=67 ms
result(dst unalign, src aligned): memcpy_fast=37ms memcpy=91 ms
result(dst unalign, src unalign): memcpy_fast=43ms memcpy=90 ms
benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_fast=17ms memcpy=24 ms
result(dst aligned, src unalign): memcpy_fast=17ms memcpy=22 ms
result(dst unalign, src aligned): memcpy_fast=18ms memcpy=27 ms
result(dst unalign, src unalign): memcpy_fast=17ms memcpy=27 ms
benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_fast=15ms memcpy=19 ms
result(dst aligned, src unalign): memcpy_fast=17ms memcpy=18 ms
result(dst unalign, src aligned): memcpy_fast=18ms memcpy=26 ms
result(dst unalign, src unalign): memcpy_fast=17ms memcpy=27 ms
benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_fast=74ms memcpy=61 ms
result(dst aligned, src unalign): memcpy_fast=70ms memcpy=62 ms
result(dst unalign, src aligned): memcpy_fast=74ms memcpy=65 ms
result(dst unalign, src unalign): memcpy_fast=74ms memcpy=65 ms
benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_fast=69ms memcpy=67 ms
result(dst aligned, src unalign): memcpy_fast=68ms memcpy=76 ms
result(dst unalign, src aligned): memcpy_fast=76ms memcpy=74 ms
result(dst unalign, src unalign): memcpy_fast=71ms memcpy=74 ms
benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_fast=105ms memcpy=176 ms
result(dst aligned, src unalign): memcpy_fast=83ms memcpy=146 ms
result(dst unalign, src aligned): memcpy_fast=86ms memcpy=144 ms
result(dst unalign, src unalign): memcpy_fast=78ms memcpy=131 ms
benchmark random access:
memcpy_fast=193ms memcpy=192ms
Visual Studio 2017
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=31ms memcpy=22 ms
result(dst aligned, src unalign): memcpy_fast=36ms memcpy=23 ms
result(dst unalign, src aligned): memcpy_fast=37ms memcpy=27 ms
result(dst unalign, src unalign): memcpy_fast=36ms memcpy=32 ms
benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=33ms memcpy=61 ms
result(dst aligned, src unalign): memcpy_fast=33ms memcpy=53 ms
result(dst unalign, src aligned): memcpy_fast=33ms memcpy=51 ms
result(dst unalign, src unalign): memcpy_fast=35ms memcpy=67 ms
benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_fast=45ms memcpy=101 ms
result(dst aligned, src unalign): memcpy_fast=44ms memcpy=101 ms
result(dst unalign, src aligned): memcpy_fast=42ms memcpy=102 ms
result(dst unalign, src unalign): memcpy_fast=43ms memcpy=100 ms
benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_fast=35ms memcpy=66 ms
result(dst aligned, src unalign): memcpy_fast=36ms memcpy=69 ms
result(dst unalign, src aligned): memcpy_fast=36ms memcpy=88 ms
result(dst unalign, src unalign): memcpy_fast=38ms memcpy=89 ms
benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_fast=16ms memcpy=21 ms
result(dst aligned, src unalign): memcpy_fast=17ms memcpy=21 ms
result(dst unalign, src aligned): memcpy_fast=16ms memcpy=27 ms
result(dst unalign, src unalign): memcpy_fast=17ms memcpy=26 ms
benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_fast=14ms memcpy=18 ms
result(dst aligned, src unalign): memcpy_fast=16ms memcpy=20 ms
result(dst unalign, src aligned): memcpy_fast=16ms memcpy=25 ms
result(dst unalign, src unalign): memcpy_fast=16ms memcpy=26 ms
benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_fast=72ms memcpy=64 ms
result(dst aligned, src unalign): memcpy_fast=72ms memcpy=64 ms
result(dst unalign, src aligned): memcpy_fast=75ms memcpy=63 ms
result(dst unalign, src unalign): memcpy_fast=71ms memcpy=66 ms
benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_fast=67ms memcpy=79 ms
result(dst aligned, src unalign): memcpy_fast=77ms memcpy=80 ms
result(dst unalign, src aligned): memcpy_fast=80ms memcpy=76 ms
result(dst unalign, src unalign): memcpy_fast=72ms memcpy=82 ms
benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_fast=98ms memcpy=126 ms
result(dst aligned, src unalign): memcpy_fast=92ms memcpy=137 ms
result(dst unalign, src aligned): memcpy_fast=85ms memcpy=136 ms
result(dst unalign, src unalign): memcpy_fast=101ms memcpy=146 ms
benchmark random access:
memcpy_fast=188ms memcpy=203ms
Visual Studio 2022 / Visual Studio 2022 Preview
benchmark(size=32 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=32ms memcpy=28 ms
result(dst aligned, src unalign): memcpy_fast=31ms memcpy=30 ms
result(dst unalign, src aligned): memcpy_fast=32ms memcpy=29 ms
result(dst unalign, src unalign): memcpy_fast=31ms memcpy=29 ms
benchmark(size=64 bytes, times=16777216):
result(dst aligned, src aligned): memcpy_fast=33ms memcpy=55 ms
result(dst aligned, src unalign): memcpy_fast=34ms memcpy=56 ms
result(dst unalign, src aligned): memcpy_fast=33ms memcpy=55 ms
result(dst unalign, src unalign): memcpy_fast=33ms memcpy=55 ms
benchmark(size=512 bytes, times=8388608):
result(dst aligned, src aligned): memcpy_fast=46ms memcpy=43 ms
result(dst aligned, src unalign): memcpy_fast=46ms memcpy=42 ms
result(dst unalign, src aligned): memcpy_fast=43ms memcpy=44 ms
result(dst unalign, src unalign): memcpy_fast=45ms memcpy=45 ms
benchmark(size=1024 bytes, times=4194304):
result(dst aligned, src aligned): memcpy_fast=35ms memcpy=34 ms
result(dst aligned, src unalign): memcpy_fast=41ms memcpy=34 ms
result(dst unalign, src aligned): memcpy_fast=37ms memcpy=35 ms
result(dst unalign, src unalign): memcpy_fast=41ms memcpy=35 ms
benchmark(size=4096 bytes, times=524288):
result(dst aligned, src aligned): memcpy_fast=16ms memcpy=15 ms
result(dst aligned, src unalign): memcpy_fast=16ms memcpy=15 ms
result(dst unalign, src aligned): memcpy_fast=17ms memcpy=16 ms
result(dst unalign, src unalign): memcpy_fast=17ms memcpy=16 ms
benchmark(size=8192 bytes, times=262144):
result(dst aligned, src aligned): memcpy_fast=15ms memcpy=16 ms
result(dst aligned, src unalign): memcpy_fast=17ms memcpy=15 ms
result(dst unalign, src aligned): memcpy_fast=16ms memcpy=16 ms
result(dst unalign, src unalign): memcpy_fast=16ms memcpy=15 ms
benchmark(size=1048576 bytes, times=2048):
result(dst aligned, src aligned): memcpy_fast=71ms memcpy=61 ms
result(dst aligned, src unalign): memcpy_fast=74ms memcpy=69 ms
result(dst unalign, src aligned): memcpy_fast=72ms memcpy=62 ms
result(dst unalign, src unalign): memcpy_fast=73ms memcpy=65 ms
benchmark(size=4194304 bytes, times=512):
result(dst aligned, src aligned): memcpy_fast=68ms memcpy=68 ms
result(dst aligned, src unalign): memcpy_fast=73ms memcpy=72 ms
result(dst unalign, src aligned): memcpy_fast=83ms memcpy=77 ms
result(dst unalign, src unalign): memcpy_fast=69ms memcpy=69 ms
benchmark(size=8388608 bytes, times=256):
result(dst aligned, src aligned): memcpy_fast=89ms memcpy=94 ms
result(dst aligned, src unalign): memcpy_fast=91ms memcpy=83 ms
result(dst unalign, src aligned): memcpy_fast=87ms memcpy=88 ms
result(dst unalign, src unalign): memcpy_fast=108ms memcpy=87 ms
benchmark random access:
memcpy_fast=197ms memcpy=172ms
2019는 설치를 하지 않아서 테스트를 하지 못했습니다.
지금 다시 보니 2022에서 SIMD가 사용되면서 빨라진 것 같네요.
다만 제가 개인적으로 만들어서 테스트한 결과는 크기가 커지면 커질수록 성능 차이가 더 벌어지는 걸로
봐서는 SIMD 외의 최적화가 추가된 것으로 보이네요.
그래서 결론은, 반복 복사가 필요한 경우 Loop Unrolling 같은 기법의 효과가 훨씬 크다는 것이 되겠습니다.