Skip to content

Commit 995ff49

Browse files
committed
finish the project, still with bugs
1 parent 38caa52 commit 995ff49

File tree

11 files changed

+570
-42
lines changed

11 files changed

+570
-42
lines changed

README.md

Lines changed: 93 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,6 +22,10 @@
2222
- [page\_cache内存释放](#page_cache内存释放)
2323
- [大于256k的情况](#大于256k的情况)
2424
- [处理代码中`new`的问题](#处理代码中new的问题)
25+
- [解决free,使其不用传大小](#解决free使其不用传大小)
26+
- [多线程场景下深度测试](#多线程场景下深度测试)
27+
- [分析性能瓶颈](#分析性能瓶颈)
28+
- [用Radix Tree进行优化](#用radix-tree进行优化)
2529

2630
***
2731

@@ -1196,4 +1200,92 @@ void page_cache::release_span_to_page(span* s) {
11961200

11971201
## 处理代码中`new`的问题
11981202

1199-
代码中有些地方用了`new span`。这个就很不对。我们弄这个tcmalloc是用来替代malloc的,既然是替代,那我们的代码里面怎么能有`new``new`也是调用`malloc`的,所以我们要改一下。
1203+
代码中有些地方用了`new span`。这个就很不对。我们弄这个tcmalloc是用来替代malloc的,既然是替代,那我们的代码里面怎么能有`new`?`new`也是调用`malloc`的,所以我们要改一下。
1204+
1205+
然后之前是写了一个定长内存池的,可以用来代替new。
1206+
1207+
**博客地址:[内存池是什么原理?|内存池简易模拟实现|为学习高并发内存池tcmalloc做准备](https://blog.csdn.net/Yu_Cblog/article/details/131741601)**
1208+
1209+
page_cache.hpp
1210+
```cpp
1211+
class page_cache {
1212+
private:
1213+
span_list __span_lists[PAGES_NUM];
1214+
static page_cache __s_inst;
1215+
page_cache() = default;
1216+
page_cache(const page_cache&) = delete;
1217+
std::unordered_map<PAGE_ID, span*> __id_span_map;
1218+
object_pool<span> __span_pool;
1219+
```
1220+
多加一个`object_pool<span> __span_pool;`对象。
1221+
1222+
然后,`new span`的地方都替换掉。`delete`的地方也换掉就行。
1223+
1224+
然后这里面也改一下。
1225+
1226+
tcmalloc.hpp
1227+
```cpp
1228+
static void* tcmalloc(size_t size) {
1229+
if (size > MAX_BYTES) {
1230+
// 处理申请大内存的情况
1231+
size_t align_size = size_class::round_up(size);
1232+
size_t k_page = align_size >> PAGE_SHIFT;
1233+
page_cache::get_instance()->__page_mtx.lock();
1234+
span* cur_span = page_cache::get_instance()->new_span(k_page); // 直接找pc
1235+
page_cache::get_instance()->__page_mtx.unlock();
1236+
void* ptr = (void*)(cur_span->__page_id << PAGE_SHIFT); // span转化成地址
1237+
return ptr;
1238+
}
1239+
if (p_tls_thread_cache == nullptr) {
1240+
// 相当于单例
1241+
// p_tls_thread_cache = new thread_cache;
1242+
static object_pool<thread_cache> tc_pool;
1243+
p_tls_thread_cache = tc_pool.new_();
1244+
}
1245+
#ifdef PROJECT_DEBUG
1246+
LOG(DEBUG) << "tcmalloc find tc from mem" << std::endl;
1247+
#endif
1248+
return p_tls_thread_cache->allocate(size);
1249+
}
1250+
```
1251+
1252+
## 解决free,使其不用传大小
1253+
1254+
因为我们已经有页号到span的映射了。所以我们在span里面增加一个字段,obj_size就行。
1255+
1256+
## 多线程场景下深度测试
1257+
1258+
**首先要明确一点,我们不是去造一个轮子,我们要和malloc对比,不是说要比malloc快多少,因为我们在很多细节上,和tcmalloc差的还是很远的。**
1259+
1260+
测试代码可以见bench\_mark.cc。
1261+
1262+
结果
1263+
```bash
1264+
parallels@ubuntu-linux-22-04-desktop:~/Project/Google-tcmalloc-simulation-implementation$ ./out
1265+
==========================================================
1266+
4个线程并发执行10轮次,每轮次concurrent alloc 1000次: 花费:27877 ms
1267+
4个线程并发执行10轮次,每轮次concurrent dealloc 1000次: 花费:52190 ms
1268+
4个线程并发concurrent alloc&dealloc 40000次,总计花费:80067 ms
1269+
1270+
1271+
4个线程并发执行10次,每轮次malloc 1000次: 花费:2227ms
1272+
4个线程并发执行10轮次,每轮次free 1000次: 花费:1385 ms
1273+
4个线程并发malloc&free 40000次,总计花费:3612 ms
1274+
==========================================================
1275+
parallels@ubuntu-linux-22-04-desktop:~/Project/Google-tcmalloc-simulation-implementation$
1276+
```
1277+
1278+
比malloc差。
1279+
1280+
## 分析性能瓶颈
1281+
1282+
linux和windows(VS STUDIO)下都有很多性能分析的工具,可以检测哪里调用的时间多。
1283+
1284+
在这里直接出结论:锁用了很多时间。
1285+
1286+
可以用基数树进行优化。
1287+
1288+
## 用Radix Tree进行优化
1289+
1290+
radix tree 我们可以直接用tcmalloc源码里面的。`page_map.hpp`
1291+

bench_mark.cc

Lines changed: 87 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,87 @@
1+
2+
3+
#include "./include/tcmalloc.hpp"
#include <atomic>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <thread>
#include <vector>
7+
// Baseline benchmark against the system malloc/free.
// ntimes: allocations per round; nworks: number of worker threads;
// rounds: rounds executed by each thread.
//
// BUG FIX: the original used clock(), which on POSIX measures CPU time of
// the WHOLE process, so each thread's "elapsed" included the CPU burned by
// every other thread and the totals were wildly inflated. We now measure
// per-thread wall time with std::chrono::steady_clock (monotonic, immune
// to wall-clock adjustments) and accumulate milliseconds.
void BenchmarkMalloc(size_t ntimes, size_t nworks, size_t rounds) {
    std::vector<std::thread> vthread(nworks);
    // Atomics: many worker threads add their per-round costs concurrently.
    std::atomic<size_t> malloc_costtime(0);
    std::atomic<size_t> free_costtime(0);
    for (size_t k = 0; k < nworks; ++k) {
        vthread[k] = std::thread([&]() {
            std::vector<void*> v;
            v.reserve(ntimes); // one allocation for the pointer buffer; keeps it out of the timing
            for (size_t j = 0; j < rounds; ++j) {
                auto begin1 = std::chrono::steady_clock::now();
                for (size_t i = 0; i < ntimes; i++) {
                    v.push_back(malloc(16));
                    // v.push_back(malloc((16 + i) % 8192 + 1));
                }
                auto end1 = std::chrono::steady_clock::now();
                auto begin2 = std::chrono::steady_clock::now();
                for (size_t i = 0; i < ntimes; i++) {
                    free(v[i]);
                }
                auto end2 = std::chrono::steady_clock::now();
                v.clear(); // keeps capacity, so the next round does not reallocate
                malloc_costtime += (size_t)std::chrono::duration_cast<std::chrono::milliseconds>(end1 - begin1).count();
                free_costtime += (size_t)std::chrono::duration_cast<std::chrono::milliseconds>(end2 - begin2).count();
            }
        });
    }
    for (auto& t : vthread) {
        t.join();
    }
    // Message fix: "执行N轮次" + " ms" to match the free/total lines below.
    std::cout << nworks << "个线程并发执行" << rounds << "轮次,每轮次malloc " << ntimes << "次: 花费:" << malloc_costtime.load() << " ms\n";
    std::cout << nworks << "个线程并发执行" << rounds << "轮次,每轮次free " << ntimes << "次: 花费:" << free_costtime.load() << " ms\n";
    std::cout << nworks << "个线程并发malloc&free " << nworks * rounds * ntimes << "次,总计花费:" << malloc_costtime.load() + free_costtime.load() << " ms\n";
}
42+
43+
// 单轮次申请释放次数 线程数 轮次
44+
void BenchmarkConcurrentMalloc(size_t ntimes, size_t nworks, size_t rounds) {
45+
std::vector<std::thread> vthread(nworks);
46+
std::atomic<size_t> malloc_costtime(0);
47+
std::atomic<size_t> free_costtime(0);
48+
for (size_t k = 0; k < nworks; ++k) {
49+
vthread[k] = std::thread([&]() {
50+
std::vector<void*> v;
51+
v.reserve(ntimes);
52+
for (size_t j = 0; j < rounds; ++j) {
53+
size_t begin1 = clock();
54+
for (size_t i = 0; i < ntimes; i++) {
55+
v.push_back(tcmalloc(16));
56+
// v.push_back(ConcurrentAlloc((16 + i) % 8192 + 1));
57+
}
58+
size_t end1 = clock();
59+
size_t begin2 = clock();
60+
for (size_t i = 0; i < ntimes; i++) {
61+
tcfree(v[i]);
62+
}
63+
size_t end2 = clock();
64+
v.clear();
65+
malloc_costtime += (end1 - begin1);
66+
free_costtime += (end2 - begin2);
67+
}
68+
});
69+
}
70+
for (auto& t : vthread) {
71+
t.join();
72+
}
73+
std::cout << nworks << "个线程并发执行" << rounds << "轮次,每轮次concurrent alloc " << ntimes << "次: 花费:" << malloc_costtime.load() << " ms\n";
74+
std::cout << nworks << "个线程并发执行" << rounds << "轮次,每轮次concurrent dealloc " << ntimes << "次: 花费:" << free_costtime.load() << " ms\n";
75+
std::cout << nworks << "个线程并发concurrent alloc&dealloc " << nworks * rounds * ntimes << "次,总计花费:" << malloc_costtime.load() + free_costtime.load() << " ms\n";
76+
}
77+
78+
int main() {
79+
size_t n = 1000;
80+
std::cout << "==========================================================" << std::endl;
81+
BenchmarkConcurrentMalloc(n, 4, 10);
82+
// std::cout << std::endl
83+
// << std::endl;
84+
// BenchmarkMalloc(n, 4, 10);
85+
std::cout << "==========================================================" << std::endl;
86+
return 0;
87+
}

include/common.hpp

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -28,8 +28,10 @@ static const size_t PAGE_SHIFT = 13;
2828

2929
#if defined(_WIN64) || defined(__x86_64__) || defined(__ppc64__) || defined(__aarch64__)
3030
typedef unsigned long long PAGE_ID;
31+
#define SYS_BYTES 64
3132
#else
3233
typedef size_t PAGE_ID;
34+
#define SYS_BYTES 32
3335
#endif
3436

3537
inline static void* system_alloc(size_t kpage) {
@@ -199,6 +201,7 @@ class span {
199201
size_t __use_count = 0; // 切成段小块内存,被分配给threadCache的计数器
200202
void* __free_list = nullptr; // 切好的小块内存的自由链表
201203
bool __is_use = false; // 是否在被使用
204+
size_t __obj_size; // 切好的小对象的大小
202205
};
203206

204207
// 带头双向循环链表

include/object_pool.hpp

Lines changed: 53 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,53 @@
1+
2+
3+
#ifndef __YUFC_OBJECT_POOL_HPP__
4+
#define __YUFC_OBJECT_POOL_HPP__
5+
6+
#include <iostream>
7+
#include <vector>
8+
#include "./common.hpp"
9+
10+
#define __DEFAULT_KB__ 128

// Fixed-size object pool: carves T-sized slots out of large malloc'd chunks
// and recycles returned slots through an intrusive free list (the first
// sizeof(void*) bytes of a free slot store the next pointer).
// Chunks are intentionally never returned to the system: the pool exists to
// replace new/delete inside the allocator itself.
template <class T>
class object_pool {
private:
    char* __memory = nullptr; // cursor into the current chunk (char* makes slicing easy)
    size_t __remain_bytes = 0; // bytes still available in the current chunk
    void* __free_list = nullptr; // recycled slots, linked through their first word
public:
    // Allocate one T. Prefers a recycled slot; otherwise carves from the
    // current chunk, grabbing a fresh chunk when it runs out.
    // Throws std::bad_alloc if the underlying malloc fails.
    T* new_() {
        T* obj = nullptr;
        // First choice: reuse a slot from the free list (head pop).
        if (__free_list) {
            void* next = *((void**)__free_list);
            obj = (T*)__free_list;
            __free_list = next;
            return obj;
        }
        // A slot must be able to hold a next-pointer while on the free list,
        // so the effective slot size is at least sizeof(void*).
        size_t obj_size = sizeof(T) < sizeof(void*) ? sizeof(void*) : sizeof(T);
        // BUG FIX: the refill check must use obj_size, not sizeof(T).
        // When sizeof(T) < sizeof(void*), the old check could accept a
        // remainder in [sizeof(T), obj_size), and the size_t subtraction
        // below would underflow — after which the pool hands out memory far
        // past the end of the chunk.
        if (__remain_bytes < obj_size) {
            // Chunk exhausted: grab a new __DEFAULT_KB__ KiB chunk.
            __remain_bytes = __DEFAULT_KB__ * 1024;
            __memory = (char*)malloc(__remain_bytes);
            if (__memory == nullptr) {
                throw std::bad_alloc();
            }
        }
        obj = (T*)__memory;
        __memory += obj_size;
        __remain_bytes -= obj_size;
        new (obj) T; // placement-new: construct T in the carved slot
        return obj;
    }
    // Return one object: destroy it and push the slot onto the free list.
    void delete_(T* obj) {
        obj->~T();
        *(void**)obj = __free_list;
        __free_list = obj;
    }
};
52+
53+
#endif

include/page_cache.hpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -4,14 +4,18 @@
44
#define __YUFC_PAGE_CACHE_HPP__
55

66
#include "./common.hpp"
7+
#include "./object_pool.hpp"
8+
#include "./page_map.hpp"
79

810
class page_cache {
911
private:
1012
span_list __span_lists[PAGES_NUM];
1113
static page_cache __s_inst;
1214
page_cache() = default;
1315
page_cache(const page_cache&) = delete;
14-
std::unordered_map<PAGE_ID, span*> __id_span_map;
16+
// std::unordered_map<PAGE_ID, span*> __id_span_map;
17+
TCMalloc_PageMap3<SYS_BYTES - PAGE_SHIFT> __id_span_map;
18+
object_pool<span> __span_pool;
1519

1620
public:
1721
std::mutex __page_mtx;
@@ -21,6 +25,7 @@ class page_cache {
2125
span* map_obj_to_span(void* obj);
2226
// 释放空闲的span回到pc,并合并相邻的span
2327
void release_span_to_page(span* s, size_t size = 0);
28+
2429
public:
2530
// 获取一个K页的span
2631
span* new_span(size_t k);

0 commit comments

Comments
 (0)