
Reduce peak commit on memory-alloc intensive apps #150

Closed
aganea opened this issue Feb 4, 2020 · 29 comments

@aganea

aganea commented Feb 4, 2020

I've recently integrated rpmalloc and mimalloc into LLVM, please see thread: https://reviews.llvm.org/D71786
I discovered along the way that rpmalloc takes more memory than mimalloc when linking with LLD & ThinLTO. For example:

                      | Working Set | Private Working Set | Commit  | Virtual Size
rpmalloc - 36 threads | 25.1 GB     | 16.5 GB             | 19.9 GB | 37.4 GB
mimalloc - 36 threads | 25.6 GB     | 16.3 GB             | 18.3 GB | 33.3 GB
rpmalloc - 72 threads | 33.6 GB     | 25.1 GB             | 28.5 GB | 46.0 GB
mimalloc - 72 threads | 30.5 GB     | 21.2 GB             | 23.4 GB | 38.4 GB

There's also a difference in execution time, in favor of mimalloc. The gap seems proportional to the difference in commit size between the two.


To repro (Windows cmd shell, but you could probably reproduce all of this on Linux as well):

$ git clone https://github.com/llvm/llvm-project.git
# Download patch from https://reviews.llvm.org/D71786
$ git apply D71786.txt

# ROOT is where LLVM was checked out by git clone above, modify accordingly
$ set ROOT=d:/llvm-project
$ set LLVM=c:/Program Files/LLVM

# Ensure cmake, python 3.7, gnuWin32, git, ninja build and a LLVM package (llvm.org) are installed first.
$ cd %ROOT%
$ mkdir stage1
$ cd stage1

# Feel free to fiddle with the following flags according to your hardware config
$ set OPT_AVX=/GS- /D_ITERATOR_DEBUG_LEVEL=0 /arch:AVX
$ set OPT_SKYLAKE=/GS- /D_ITERATOR_DEBUG_LEVEL=0 -Xclang -O3 -Xclang -fwhole-program-vtables -fstrict-aliasing -march=skylake-avx512

$ cmake -GNinja %ROOT%/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_OPTIMIZED_TABLEGEN=ON -DLLVM_ENABLE_ASSERTIONS=ON -DLLVM_ENABLE_LIBXML2=OFF -DCMAKE_C_COMPILER="%LLVM%/bin/clang-cl.EXE" -DCMAKE_CXX_COMPILER="%LLVM%/bin/clang-cl.EXE" -DCMAKE_LINKER="%LLVM%/bin/lld-link.EXE" -DLLVM_ENABLE_PROJECTS="llvm;clang;lld" -DLLVM_ENABLE_PDB=ON -DLLVM_ENABLE_LLD=ON -DLLVM_USE_CRT_RELEASE=MT -DCMAKE_CXX_FLAGS="%OPT_AVX%" -DCMAKE_C_FLAGS="%OPT_AVX%" 

$ ninja check-all
# This should yield no errors; if it does, they were already there on trunk

# Now build the stage2:
$ cd %ROOT%
$ mkdir stage2
$ cd stage2

$ set LLVM_LOCAL=%ROOT%/stage1
$ cmake -G"Ninja" %ROOT%/llvm -DCMAKE_BUILD_TYPE=Release -DLLVM_OPTIMIZED_TABLEGEN=true -DLLVM_ENABLE_LIBXML2=OFF -DLLVM_USE_CRT_RELEASE=MT -DCMAKE_C_COMPILER="%LLVM_LOCAL%/bin/clang-cl.exe" -DCMAKE_CXX_COMPILER="%LLVM_LOCAL%/bin/clang-cl.exe" -DCMAKE_LINKER="%LLVM_LOCAL%/bin/lld-link.exe" -DLLVM_ENABLE_LLD=ON -DLLVM_ENABLE_PDB=ON -DLLVM_ENABLE_PROJECTS="llvm;clang;lld" -DCMAKE_CXX_FLAGS="%OPT_SKYLAKE%" -DCMAKE_C_FLAGS="%OPT_SKYLAKE%" -DLLVM_ENABLE_LTO=THIN -DCLANG_TABLEGEN="%LLVM_LOCAL%/bin/clang-tblgen.exe" -DLLVM_TABLEGEN="%LLVM_LOCAL%/bin/llvm-tblgen.exe"

$ ninja check-all
# This should take a lot longer, because we're now building the LLVM .exes with ThinLTO.
# Ensure you've got at least 150 GB free on the SSD. The ThinLTO cache takes a lot of space.

# Prepare for the test (pwd is still in the stage2 folder)
$ rm bin\clang.exe
$ ninja clang -v
# This will print the cmd-line to use to link clang. Copy-paste it in a file stage2\link.rsp.
# While the above ninja cmd-line links, duplicate stage2\CMakeFiles\clang.rsp to another file, say clang2.rsp. This is a temp file which is deleted once linking ends.
# Reference clang2.rsp instead of clang.rsp from stage2\link.rsp
# Ensure you remove the LTO cache flag from link.rsp

$ bin\lld-link @link.rsp /time
# This is your final test, which will use the stage2 lld-link.exe to link the stage2 clang.exe.
# Try it once to see the time it takes. You would probably want to re-run it with rpmalloc's stats enabled.

To compare with mimalloc, you'd first need to compile mimalloc as a static lib (with /GL disabled).
You can then reference it in place of rpmalloc, using the following patch (simply revert this file from the previous patch before applying):

diff --git a/llvm/lib/Support/CMakeLists.txt b/llvm/lib/Support/CMakeLists.txt
index 26332d4f539..77c7645592c 100644
--- a/llvm/lib/Support/CMakeLists.txt
+++ b/llvm/lib/Support/CMakeLists.txt
@@ -51,6 +51,31 @@ else()
   set(Z3_LINK_FILES "")
 endif()
 
+# if(LLVM_ENABLE_RPMALLOC)
+#   set(RPMALLOC_FILES rpmalloc/rpmalloc.c)
+# else()
+#   set(RPMALLOC_FILES "")
+# endif()
+set(ALLOC_BENCH_PATH "D:/git/rpmalloc-benchmark/benchmark/")
+
+# mimalloc
+set(ALLOCATOR_FILES "${ALLOC_BENCH_PATH}mimalloc/benchmark.c")
+set(ALLOCATOR_INCLUDES "${ALLOC_BENCH_PATH}mimalloc/include/" "${ALLOC_BENCH_PATH}")
+set(system_libs ${system_libs} "D:/git/mimalloc/out/msvc-x64/Release/mimalloc-static.lib" "-INCLUDE:malloc")
+
+# rpmalloc
+# set(ALLOCATOR_FILES "${ALLOC_BENCH_PATH}rpmalloc/benchmark.c" "${ALLOC_BENCH_PATH}rpmalloc/rpmalloc.c")
+# set(ALLOCATOR_INCLUDES "${ALLOC_BENCH_PATH}rpmalloc/" "${ALLOC_BENCH_PATH}")
+
+# tcmalloc
+# set(ALLOCATOR_FILES "${ALLOC_BENCH_PATH}gperftools/benchmark.c")
+# set(ALLOCATOR_INCLUDES "${ALLOC_BENCH_PATH}gperftools/" "${ALLOC_BENCH_PATH}")
+# set(system_libs ${system_libs} "D:/git/rpmalloc-benchmark/benchmark/gperftools/x64/Release-Override/libtcmalloc_minimal.lib" "-INCLUDE:malloc")
+
+# ptmalloc3
+# set(ALLOCATOR_FILES "${ALLOC_BENCH_PATH}ptmalloc3/benchmark.c" "${ALLOC_BENCH_PATH}ptmalloc3/malloc.c" "${ALLOC_BENCH_PATH}ptmalloc3/ptmalloc3.c")
+# set(ALLOCATOR_INCLUDES "${ALLOC_BENCH_PATH}ptmalloc3/" "${ALLOC_BENCH_PATH}" "${ALLOC_BENCH_PATH}ptmalloc3/sysdeps/windows")
+
 add_llvm_component_library(LLVMSupport
   AArch64TargetParser.cpp
   ABIBreak.cpp
@@ -163,6 +188,8 @@ add_llvm_component_library(LLVMSupport
   xxhash.cpp
   Z3Solver.cpp
 
+  ${ALLOCATOR_FILES}
+
 # System
   Atomic.cpp
   DynamicLibrary.cpp
@@ -197,3 +224,8 @@ if(LLVM_WITH_Z3)
     ${Z3_INCLUDE_DIR}
     )
 endif()
+
+  target_include_directories(LLVMSupport SYSTEM
+   PRIVATE
+   ${ALLOCATOR_INCLUDES}
+   )
\ No newline at end of file

You don't need to rebuild stage1, only stage2. You don't need to re-run cmake; simply run ninja all -C stage2 after applying the mimalloc modification above. You can then switch between rpmalloc and mimalloc by commenting out the relevant sections in this file and re-running ninja.

At this point, you should see a difference in terms of peak Committed memory. I'm using UIforETW (https://github.com/google/UIforETW) to take profiles on Windows.

You can probably repro this on Linux as well, maybe linking a smaller program instead of clang.exe if you want faster iteration. Please don't hesitate to poke me by email if any of these steps doesn't work or if you get stuck.

@mjansson mjansson self-assigned this Feb 4, 2020
@mjansson mjansson added this to the 1.4.1 milestone Feb 4, 2020
@mjansson
Owner

mjansson commented Feb 4, 2020

Thank you, I will dive right into this and see what I can find (and improve).

@aganea
Author

aganea commented Mar 10, 2020

Ping! I'm wondering whether you've had any time to look into this? (At the very least I'd like to know if there's potential for improvement.)
Many thanks again!

@mjansson
Owner

Yeah, I've started looking into it but been occupied with other things lately. I will try to get back to this soon.

@mjansson
Owner

mjansson commented Mar 24, 2020

I've gotten to the point where I'm going to start comparing some changes to rpmalloc, but I can't seem to correctly set up my environment. When trying to bootstrap the llvm repo and running the initial cmake in stage1, I get

-- Check for working C compiler: D:/dev/llvm/bin/clang-cl.EXE - broken
CMake Error at D:/dev/cmake/share/cmake-3.17/Modules/CMakeTestCCompiler.cmake:60 (message):
  The C compiler

    "D:/dev/llvm/bin/clang-cl.EXE"

  is not able to compile a simple test program.

  It fails with the following output:

    Change Dir: D:/llvm-project/stage1/CMakeFiles/CMakeTmp

    Run Build Command(s):C:/tools/ninja.exe cmTC_54168 && [1/2] Building C object CMakeFiles\cmTC_54168.dir\testCCompiler.c.obj
    [2/2] Linking C executable cmTC_54168.exe
    FAILED: cmd.exe /C "cd . && D:\dev\cmake\bin\cmake.exe -E vs_link_exe --intdir=CMakeFiles\cmTC_54168.dir --rc=C:\PROGRA~2\WI3CF2~1\10\bin\100183~1.0\x86\rc.exe --mt=C:\PROGRA~2\WI3CF2~1\10\bin\100183~1.0\x86\mt.exe --manifests  -- D:\dev\LLVM\bin\lld-link.exe /nologo CMakeFiles\cmTC_54168.dir\testCCompiler.c.obj  /out:cmTC_54168.exe /implib:cmTC_54168.lib /pdb:cmTC_54168.pdb /version:0.0  /machine:x64  /debug /INCREMENTAL /subsystem:console  kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib && cd ."
    LINK Pass 1: command "D:\dev\LLVM\bin\lld-link.exe /nologo CMakeFiles\cmTC_54168.dir\testCCompiler.c.obj /out:cmTC_54168.exe /implib:cmTC_54168.lib /pdb:cmTC_54168.pdb /version:0.0 /machine:x64 /debug /INCREMENTAL /subsystem:console kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib /MANIFEST /MANIFESTFILE:CMakeFiles\cmTC_54168.dir/intermediate.manifest CMakeFiles\cmTC_54168.dir/manifest.res" failed (exit code 1) with the following output:
    lld-link: error: <root>: undefined symbol: mainCRTStartup
    ninja: build stopped: subcommand failed.

@aganea
Author

aganea commented Mar 24, 2020

Are you running inside a Visual Studio cmd shell? (x64 Native Tools Command Prompt for VS 2019)
Can you try compiling & linking a simple hello world to see if that links? (inside the VS shell)
You can also use MSVC for the first stage; you don't have to use LLVM. Just leave out -DCMAKE_C_COMPILER, -DCMAKE_CXX_COMPILER, -DCMAKE_LINKER and -DLLVM_ENABLE_LLD in that case.
Or maybe we can set up a Teams channel and chat?

@mjansson
Owner

I tried using MSVC as well, but then it fails when building the ninja check-all with

[575/4773] Linking CXX shared library unittests\Support\DynamicLibrary\PipSqueak.dll
FAILED: cmd.exe /C "cd . && D:\dev\cmake\bin\cmake.exe -E vs_link_dll --intdir=unittests\Support\DynamicLibrary\CMakeFiles\PipSqueak.dir --rc=C:\PROGRA~2\WI3CF2~1\10\bin\100183~1.0\x86\rc.exe --mt=C:\PROGRA~2\WI3CF2~1\10\bin\100183~1.0\x86\mt.exe --manifests  -- D:\dev\visualstudio\VC\Tools\MSVC\14.25.28610\bin\Hostx86\x86\link.exe /nologo unittests\Support\DynamicLibrary\CMakeFiles\PipSqueak.dir\PipSqueak.cpp.obj  /out:unittests\Support\DynamicLibrary\PipSqueak.dll /implib:unittests\Support\DynamicLibrary\PipSqueak.lib /pdb:unittests\Support\DynamicLibrary\PipSqueak.pdb /dll /version:0.0 /machine:X86 /DEBUG /OPT:REF /OPT:ICF /INCREMENTAL:NO  lib\LLVMSupport.lib  -INCLUDE:malloc  psapi.lib  shell32.lib  ole32.lib  uuid.lib  advapi32.lib  delayimp.lib  -delayload:shell32.dll  -delayload:ole32.dll  lib\LLVMDemangle.lib  kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib  && cd ."
LINK: command "D:\dev\visualstudio\VC\Tools\MSVC\14.25.28610\bin\Hostx86\x86\link.exe /nologo unittests\Support\DynamicLibrary\CMakeFiles\PipSqueak.dir\PipSqueak.cpp.obj /out:unittests\Support\DynamicLibrary\PipSqueak.dll /implib:unittests\Support\DynamicLibrary\PipSqueak.lib /pdb:unittests\Support\DynamicLibrary\PipSqueak.pdb /dll /version:0.0 /machine:X86 /DEBUG /OPT:REF /OPT:ICF /INCREMENTAL:NO lib\LLVMSupport.lib -INCLUDE:malloc psapi.lib shell32.lib ole32.lib uuid.lib advapi32.lib delayimp.lib -delayload:shell32.dll -delayload:ole32.dll lib\LLVMDemangle.lib kernel32.lib user32.lib gdi32.lib winspool.lib shell32.lib ole32.lib oleaut32.lib uuid.lib comdlg32.lib advapi32.lib /MANIFEST /MANIFESTFILE:unittests\Support\DynamicLibrary\PipSqueak.dll.manifest" failed (exit code 1120) with the following output:
   Creating library unittests\Support\DynamicLibrary\PipSqueak.lib and object unittests\Support\DynamicLibrary\PipSqueak.exp
LINK : warning LNK4199: /DELAYLOAD:shell32.dll ignored; no imports found from shell32.dll
LINK : warning LNK4199: /DELAYLOAD:ole32.dll ignored; no imports found from ole32.dll
LINK : error LNK2001: unresolved external symbol malloc
unittests\Support\DynamicLibrary\PipSqueak.dll : fatal error LNK1120: 1 unresolved externals

@aganea
Author

aganea commented Mar 25, 2020

At this point, it should pick up the malloc symbol in rpmalloc. My patch on phabricator might not be complete, sorry about that.

Are ENABLE_OVERRIDE and ENABLE_PRELOAD set to 1 in the rpmalloc.c file inside LLVM?
If that doesn't work, is rpmalloc's malloc.c part of llvm/lib/Support/CMakeLists.txt? (Add it to the line set(RPMALLOC_FILES rpmalloc/rpmalloc.c).)
Try running that link command above with /VERBOSE to see where it looks for the malloc symbol.

@mjansson
Owner

Uh, I came back to this today and realized it was just me being completely stupid: I was running the wrong command shell (it was set up for the x86 MSVC environment, not x86-64).

@mjansson
Owner

Seems I'm on track once again, stage1 completed and will continue this investigation tomorrow

@mjansson
Owner

Got usable timing and dumps from stage2, investigating and trying out some ideas.

@aganea
Author

aganea commented Mar 27, 2020

Great news! Many thanks again for taking the time on this ;-)

@mjansson
Owner

I've made some progress on this, I hope to have something ready in a few days

@mjansson
Owner

@aganea I'm having issues trying to reduce the small block granularity from 16 to 8 bytes to see if that causes the additional memory usage, but it seems lld-link.exe assumes allocations are 16-byte aligned. Running lld-link.exe with rpmalloc returning 8-byte aligned memory blocks, it bails out when trying to zero out memory; the disassembly looks like

00007FF628741C43  mov         ecx,88h  
00007FF628741C48  call        operator new (07FF62A26AAACh)  
00007FF628741C4D  lea         rcx,[rax+80h]  
00007FF628741C54  movaps      xmmword ptr [rax+70h],xmm6  
00007FF628741C58  movaps      xmmword ptr [rax],xmm6  
00007FF628741C5B  movaps      xmmword ptr [rax+10h],xmm6  
00007FF628741C5F  movaps      xmmword ptr [rax+20h],xmm6  
00007FF628741C63  movaps      xmmword ptr [rax+30h],xmm6  
00007FF628741C67  movaps      xmmword ptr [rax+40h],xmm6  
00007FF628741C6B  movaps      xmmword ptr [rax+50h],xmm6  
00007FF628741C6F  movaps      xmmword ptr [rax+60h],xmm6  
00007FF628741C73  mov         qword ptr [rax+80h],0  
00007FF628741C7E  mov         qword ptr [rax+70h],rcx  

It bails on the movaps at 00007FF628741C54 (movaps xmmword ptr [rax+70h],xmm6), since the address is not 16-byte aligned. But how can it assume the plain new operator will return 16-byte aligned blocks?

Do you have any insights into this? I thought mimalloc had 8-byte granularity/alignment as well.

@aganea
Author

aganea commented Apr 21, 2020

I think it's because the x86-64 ABI defines __STDCPP_DEFAULT_NEW_ALIGNMENT__ = 16 (see here), which is controlled through -fnew-alignment=, and LLVM/Clang/LLD compiles with -std:c++14. Maybe add -fnew-alignment=8 to set OPT_SKYLAKE=... and add -DCMAKE_CXX_STANDARD=17 on the cmake cmd-line?

@mjansson
Owner

I guess I must have missed something when looking at the mimalloc source, then; I got the impression it had a natural block alignment of sizeof(uintptr_t).

@mjansson
Owner

@aganea could you try the mjansson/llvm-opt branch and see what results you get on your end? You will have to change the enable preload/override defines to 1, I guess; otherwise it should be good to go inside LLVM.

@aganea
Author

aganea commented Apr 21, 2020

Great! I will do that and get back.

@aganea
Author

aganea commented Apr 21, 2020

Do you think with -std=c++17 and 'aligned new', things could be improved?

@mjansson
Owner

I think it's better to leave it at 16 bytes and have that as the natural granularity in the allocator, to allow the optimizer to utilize SSE instructions freely.

@mjansson
Owner

Btw this branch will probably not reduce peak commit, but it will hopefully make it faster :)

@aganea
Author

aganea commented May 6, 2020

I did some tests with the mjansson/llvm-opt branch. Performance seems better overall: there's a consistent decrease in link times with LLD (-10 sec over 148 sec of link time), to about the same times as mimalloc. However, there's a slight increase in commit size. Overall it looks good; I'll do more tests and update the LLVM review. I've also tested snmalloc and will post that (all are in the same ballpark in terms of performance, but mimalloc generally does better in terms of commit, rpmalloc is behind, then comes snmalloc, which has the highest commit usage).

@mjansson
Owner

mjansson commented May 7, 2020

I'll take another look at this and see if I can tune the sizes of class buckets to reduce the overhead and in the end the total commit.

@aganea
Author

aganea commented May 8, 2020

The profile trace shows about the same number of cycles executed in both cases. The wall time fluctuates a lot because of the ThinLTO threading. I thought the timings were better, but they're about the same. I'll run an ABBA test overnight just to see the median times.

Old rpmalloc, on develop, but I can't remember at which checkout:
clang_old_rpmalloc2
clang_old_rpmalloc1

New rpmalloc, llvm-opt branch:
clang_llvmopt_rpmalloc2
clang_llvmopt_rpmalloc1

As you can see, the cumulated thread time is about the same (6,382 seconds). However, your newer version does fewer commits/decommits (197,747 vs. 221,359 for the older one) and also seems to touch fewer memory pages (43 GB vs. 64 GB for the old one). However, the committed bytes are a bit higher, like I mentioned: 33 GB committed with llvm-opt, 29 GB with develop.

Now the competition:

snmalloc:
clang_snmalloc2
clang_snmalloc1

snmalloc seems to commit much larger memory ranges in general, doing fewer operations, but at the same time it has much higher committed bytes overall (43 GB) vs. rpmalloc llvm-opt (33 GB). The CPU time is +250 sec of cumulated thread time vs. rpmalloc.

mimalloc
clang_mimalloc2
clang_mimalloc1

mimalloc seems to have the best resource compromise overall: only 24 GB committed bytes, however with a huge number of page operations (1,237,652) in a whopping 111 GB of virtual memory touched. CPU time is roughly +100 sec of cumulated thread time, mainly because of time spent in the kernel doing all the page operations.

In essence:

|                   | OS page operations | Touched virtual memory | Committed |
|-------------------|--------------------|------------------------|-----------|
| rpmalloc-develop  | 221,259            | 64 GB                  | 29 GB     |
| rpmalloc-llvm-opt | 195,747            | 44 GB                  | 33 GB     |
| snmalloc          | 100,815            | 46 GB                  | 43 GB     |
| mimalloc          | 1,237,652          | 112 GB                 | 24 GB     |

I tested on Windows 10 version 1909, on a 36-core dual Intel Gold 6140, 128 GB RAM, with RAID-0 SSDs. I will run more tests on a 6-core to see if there's any difference, and I will check how are the build+link times with a few of our games.

@aganea
Author

aganea commented May 8, 2020

Please see the results from last night's ABBA test. It ran for 10 hours. It seems llvm-opt saves about 2 sec of link time on the 36-core. I'll run the test again on the 6-core; the difference should be higher, I imagine:

Total iterations: 128

     |         Min       |        Mean       |       Median      |         Max       |
   A |  00:02:17.7783925 |  00:02:22.4748098 |  00:02:22.5980236 |  00:02:30.7857265 | old rpmalloc
   B |  00:02:16.7175005 |  00:02:20.2233198 |  00:02:20.2000373 |  00:02:27.2670286 | new rpmalloc
Diff | -00:00:01.0608920 | -00:00:02.2514900 | -00:00:02.3979863 | -00:00:03.5186979 |

@aganea
Author

aganea commented May 9, 2020

10-hour test on the 36-core with snmalloc and mimalloc. Same conditions as above.

Total iterations: 123

     |         Min       |        Mean       |       Median      |         Max       |
   A |  00:02:23.4889505 |  00:02:28.9089909 |  00:02:28.7306476 |  00:02:37.1696736 | snmalloc
   B |  00:02:20.6711606 |  00:02:25.2934670 |  00:02:25.3325200 |  00:02:33.8035443 | mimalloc
Diff | -00:00:02.8177899 | -00:00:03.6155239 | -00:00:03.3981276 | -00:00:03.3661293 |

@mjansson
Owner

mjansson commented Jul 2, 2020

So if I'm reading the latest numbers here (and from your last comment on https://reviews.llvm.org/D71786) it seems wall clock time performance of rpmalloc is now on par with the others (mimalloc, snmalloc) and I should focus efforts on the peak commit?

@aganea
Author

aganea commented Jul 2, 2020

Yes indeed! I could send the allocation patterns for all three allocators by email. As mentioned previously, it seems mimalloc does lots of commits over very small ranges. But perhaps the internal structures are important as well.

Do you think you can sort out something with the license? It seems like it's the most contentious issue. Is it possible to switch the license to MIT during the lifetime of a repo, or is it fixed once the repo is created?

@mjansson
Owner

mjansson commented Jul 2, 2020

rpmalloc has been dual licensed for quite some time, either released to the public domain, or if you are a grumpy lawyer, under MIT license.

I now noticed the LICENSE file had not been updated (or lost in some merge) and only the README had the dual license. I've updated the LICENSE file now to include the MIT option as well.

@mjansson
Owner

mjansson commented Jul 5, 2020

I've reworked the caches a bit in the mjansson/array-cache branch. Give it a try and see what impact it has on performance and peak commit.
