hbm.c (13944B)
1// SPDX-License-Identifier: GPL-2.0 2/* Copyright (c) 2019 Facebook 3 * 4 * This program is free software; you can redistribute it and/or 5 * modify it under the terms of version 2 of the GNU General Public 6 * License as published by the Free Software Foundation. 7 * 8 * Example program for Host Bandwidth Managment 9 * 10 * This program loads a cgroup skb BPF program to enforce cgroup output 11 * (egress) or input (ingress) bandwidth limits. 12 * 13 * USAGE: hbm [-d] [-l] [-n <id>] [-r <rate>] [-s] [-t <secs>] [-w] [-h] [prog] 14 * Where: 15 * -d Print BPF trace debug buffer 16 * -l Also limit flows doing loopback 17 * -n <#> To create cgroup \"/hbm#\" and attach prog 18 * Default is /hbm1 19 * --no_cn Do not return cn notifications 20 * -r <rate> Rate limit in Mbps 21 * -s Get HBM stats (marked, dropped, etc.) 22 * -t <time> Exit after specified seconds (default is 0) 23 * -w Work conserving flag. cgroup can increase its bandwidth 24 * beyond the rate limit specified while there is available 25 * bandwidth. Current implementation assumes there is only 26 * NIC (eth0), but can be extended to support multiple NICs. 27 * Currrently only supported for egress. 28 * -h Print this info 29 * prog BPF program file name. Name defaults to hbm_out_kern.o 30 */ 31 32#define _GNU_SOURCE 33 34#include <stdio.h> 35#include <stdlib.h> 36#include <assert.h> 37#include <sys/time.h> 38#include <unistd.h> 39#include <errno.h> 40#include <fcntl.h> 41#include <linux/unistd.h> 42#include <linux/compiler.h> 43 44#include <linux/bpf.h> 45#include <bpf/bpf.h> 46#include <getopt.h> 47 48#include "cgroup_helpers.h" 49#include "hbm.h" 50#include "bpf_util.h" 51#include <bpf/libbpf.h> 52 53bool outFlag = true; 54int minRate = 1000; /* cgroup rate limit in Mbps */ 55int rate = 1000; /* can grow if rate conserving is enabled */ 56int dur = 1; 57bool stats_flag; 58bool loopback_flag; 59bool debugFlag; 60bool work_conserving_flag; 61bool no_cn_flag; 62bool edt_flag; 63 64static void Usage(void); 65static void read_trace_pipe2(void); 66static void do_error(char *msg, bool errno_flag); 67 68#define DEBUGFS "/sys/kernel/debug/tracing/" 69 70static struct bpf_program *bpf_prog; 71static struct bpf_object *obj; 72static int queue_stats_fd; 73 74static void read_trace_pipe2(void) 75{ 76 int trace_fd; 77 FILE *outf; 78 char *outFname = "hbm_out.log"; 79 80 trace_fd = open(DEBUGFS "trace_pipe", O_RDONLY, 0); 81 if (trace_fd < 0) { 82 printf("Error opening trace_pipe\n"); 83 return; 84 } 85 86// Future support of ingress 87// if (!outFlag) 88// outFname = "hbm_in.log"; 89 outf = fopen(outFname, "w"); 90 91 if (outf == NULL) 92 printf("Error creating %s\n", outFname); 93 94 while (1) { 95 static char buf[4097]; 96 ssize_t sz; 97 98 sz = read(trace_fd, buf, sizeof(buf) - 1); 99 if (sz > 0) { 100 buf[sz] = 0; 101 puts(buf); 102 if (outf != NULL) { 103 fprintf(outf, "%s\n", buf); 104 fflush(outf); 105 } 106 } 107 } 108} 109 110static void do_error(char *msg, bool errno_flag) 111{ 112 if (errno_flag) 113 printf("ERROR: %s, errno: %d\n", msg, errno); 114 else 115 printf("ERROR: %s\n", msg); 116 exit(1); 117} 118 119static int prog_load(char *prog) 120{ 121 struct bpf_program *pos; 122 const char *sec_name; 123 124 obj = bpf_object__open_file(prog, NULL); 125 if (libbpf_get_error(obj)) { 126 printf("ERROR: opening BPF object file failed\n"); 127 return 1; 128 } 129 130 /* load BPF program */ 131 if (bpf_object__load(obj)) { 132 printf("ERROR: loading BPF object file failed\n"); 133 goto err; 134 } 135 136 bpf_object__for_each_program(pos, obj) { 137 sec_name = bpf_program__section_name(pos); 138 if (sec_name && !strcmp(sec_name, "cgroup_skb/egress")) { 139 bpf_prog = pos; 140 break; 141 } 142 } 143 if (!bpf_prog) { 144 printf("ERROR: finding a prog in obj file failed\n"); 145 goto err; 146 } 147 148 queue_stats_fd = bpf_object__find_map_fd_by_name(obj, "queue_stats"); 149 if (queue_stats_fd < 0) { 150 printf("ERROR: finding a map in obj file failed\n"); 151 goto err; 152 } 153 154 return 0; 155 156err: 157 bpf_object__close(obj); 158 return 1; 159} 160 161static int run_bpf_prog(char *prog, int cg_id) 162{ 163 struct hbm_queue_stats qstats = {0}; 164 char cg_dir[100], cg_pin_path[100]; 165 struct bpf_link *link = NULL; 166 int key = 0; 167 int cg1 = 0; 168 int rc = 0; 169 170 sprintf(cg_dir, "/hbm%d", cg_id); 171 rc = prog_load(prog); 172 if (rc != 0) 173 return rc; 174 175 if (setup_cgroup_environment()) { 176 printf("ERROR: setting cgroup environment\n"); 177 goto err; 178 } 179 cg1 = create_and_get_cgroup(cg_dir); 180 if (!cg1) { 181 printf("ERROR: create_and_get_cgroup\n"); 182 goto err; 183 } 184 if (join_cgroup(cg_dir)) { 185 printf("ERROR: join_cgroup\n"); 186 goto err; 187 } 188 189 qstats.rate = rate; 190 qstats.stats = stats_flag ? 1 : 0; 191 qstats.loopback = loopback_flag ? 1 : 0; 192 qstats.no_cn = no_cn_flag ? 1 : 0; 193 if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) { 194 printf("ERROR: Could not update map element\n"); 195 goto err; 196 } 197 198 if (!outFlag) 199 bpf_program__set_expected_attach_type(bpf_prog, BPF_CGROUP_INET_INGRESS); 200 201 link = bpf_program__attach_cgroup(bpf_prog, cg1); 202 if (libbpf_get_error(link)) { 203 fprintf(stderr, "ERROR: bpf_program__attach_cgroup failed\n"); 204 goto err; 205 } 206 207 sprintf(cg_pin_path, "/sys/fs/bpf/hbm%d", cg_id); 208 rc = bpf_link__pin(link, cg_pin_path); 209 if (rc < 0) { 210 printf("ERROR: bpf_link__pin failed: %d\n", rc); 211 goto err; 212 } 213 214 if (work_conserving_flag) { 215 struct timeval t0, t_last, t_new; 216 FILE *fin; 217 unsigned long long last_eth_tx_bytes, new_eth_tx_bytes; 218 signed long long last_cg_tx_bytes, new_cg_tx_bytes; 219 signed long long delta_time, delta_bytes, delta_rate; 220 int delta_ms; 221#define DELTA_RATE_CHECK 10000 /* in us */ 222#define RATE_THRESHOLD 9500000000 /* 9.5 Gbps */ 223 224 bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); 225 if (gettimeofday(&t0, NULL) < 0) 226 do_error("gettimeofday failed", true); 227 t_last = t0; 228 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", "r"); 229 if (fscanf(fin, "%llu", &last_eth_tx_bytes) != 1) 230 do_error("fscanf fails", false); 231 fclose(fin); 232 last_cg_tx_bytes = qstats.bytes_total; 233 while (true) { 234 usleep(DELTA_RATE_CHECK); 235 if (gettimeofday(&t_new, NULL) < 0) 236 do_error("gettimeofday failed", true); 237 delta_ms = (t_new.tv_sec - t0.tv_sec) * 1000 + 238 (t_new.tv_usec - t0.tv_usec)/1000; 239 if (delta_ms > dur * 1000) 240 break; 241 delta_time = (t_new.tv_sec - t_last.tv_sec) * 1000000 + 242 (t_new.tv_usec - t_last.tv_usec); 243 if (delta_time == 0) 244 continue; 245 t_last = t_new; 246 fin = fopen("/sys/class/net/eth0/statistics/tx_bytes", 247 "r"); 248 if (fscanf(fin, "%llu", &new_eth_tx_bytes) != 1) 249 do_error("fscanf fails", false); 250 fclose(fin); 251 printf(" new_eth_tx_bytes:%llu\n", 252 new_eth_tx_bytes); 253 bpf_map_lookup_elem(queue_stats_fd, &key, &qstats); 254 new_cg_tx_bytes = qstats.bytes_total; 255 delta_bytes = new_eth_tx_bytes - last_eth_tx_bytes; 256 last_eth_tx_bytes = new_eth_tx_bytes; 257 delta_rate = (delta_bytes * 8000000) / delta_time; 258 printf("%5d - eth_rate:%.1fGbps cg_rate:%.3fGbps", 259 delta_ms, delta_rate/1000000000.0, 260 rate/1000.0); 261 if (delta_rate < RATE_THRESHOLD) { 262 /* can increase cgroup rate limit, but first 263 * check if we are using the current limit. 264 * Currently increasing by 6.25%, unknown 265 * if that is the optimal rate. 266 */ 267 int rate_diff100; 268 269 delta_bytes = new_cg_tx_bytes - 270 last_cg_tx_bytes; 271 last_cg_tx_bytes = new_cg_tx_bytes; 272 delta_rate = (delta_bytes * 8000000) / 273 delta_time; 274 printf(" rate:%.3fGbps", 275 delta_rate/1000000000.0); 276 rate_diff100 = (((long long)rate)*1000000 - 277 delta_rate) * 100 / 278 (((long long) rate) * 1000000); 279 printf(" rdiff:%d", rate_diff100); 280 if (rate_diff100 <= 3) { 281 rate += (rate >> 4); 282 if (rate > RATE_THRESHOLD / 1000000) 283 rate = RATE_THRESHOLD / 1000000; 284 qstats.rate = rate; 285 printf(" INC\n"); 286 } else { 287 printf("\n"); 288 } 289 } else { 290 /* Need to decrease cgroup rate limit. 291 * Currently decreasing by 12.5%, unknown 292 * if that is optimal 293 */ 294 printf(" DEC\n"); 295 rate -= (rate >> 3); 296 if (rate < minRate) 297 rate = minRate; 298 qstats.rate = rate; 299 } 300 if (bpf_map_update_elem(queue_stats_fd, &key, &qstats, BPF_ANY)) 301 do_error("update map element fails", false); 302 } 303 } else { 304 sleep(dur); 305 } 306 // Get stats! 307 if (stats_flag && bpf_map_lookup_elem(queue_stats_fd, &key, &qstats)) { 308 char fname[100]; 309 FILE *fout; 310 311 if (!outFlag) 312 sprintf(fname, "hbm.%d.in", cg_id); 313 else 314 sprintf(fname, "hbm.%d.out", cg_id); 315 fout = fopen(fname, "w"); 316 fprintf(fout, "id:%d\n", cg_id); 317 fprintf(fout, "ERROR: Could not lookup queue_stats\n"); 318 } else if (stats_flag && qstats.lastPacketTime > 319 qstats.firstPacketTime) { 320 long long delta_us = (qstats.lastPacketTime - 321 qstats.firstPacketTime)/1000; 322 unsigned int rate_mbps = ((qstats.bytes_total - 323 qstats.bytes_dropped) * 8 / 324 delta_us); 325 double percent_pkts, percent_bytes; 326 char fname[100]; 327 FILE *fout; 328 int k; 329 static const char *returnValNames[] = { 330 "DROP_PKT", 331 "ALLOW_PKT", 332 "DROP_PKT_CWR", 333 "ALLOW_PKT_CWR" 334 }; 335#define RET_VAL_COUNT 4 336 337// Future support of ingress 338// if (!outFlag) 339// sprintf(fname, "hbm.%d.in", cg_id); 340// else 341 sprintf(fname, "hbm.%d.out", cg_id); 342 fout = fopen(fname, "w"); 343 fprintf(fout, "id:%d\n", cg_id); 344 fprintf(fout, "rate_mbps:%d\n", rate_mbps); 345 fprintf(fout, "duration:%.1f secs\n", 346 (qstats.lastPacketTime - qstats.firstPacketTime) / 347 1000000000.0); 348 fprintf(fout, "packets:%d\n", (int)qstats.pkts_total); 349 fprintf(fout, "bytes_MB:%d\n", (int)(qstats.bytes_total / 350 1000000)); 351 fprintf(fout, "pkts_dropped:%d\n", (int)qstats.pkts_dropped); 352 fprintf(fout, "bytes_dropped_MB:%d\n", 353 (int)(qstats.bytes_dropped / 354 1000000)); 355 // Marked Pkts and Bytes 356 percent_pkts = (qstats.pkts_marked * 100.0) / 357 (qstats.pkts_total + 1); 358 percent_bytes = (qstats.bytes_marked * 100.0) / 359 (qstats.bytes_total + 1); 360 fprintf(fout, "pkts_marked_percent:%6.2f\n", percent_pkts); 361 fprintf(fout, "bytes_marked_percent:%6.2f\n", percent_bytes); 362 363 // Dropped Pkts and Bytes 364 percent_pkts = (qstats.pkts_dropped * 100.0) / 365 (qstats.pkts_total + 1); 366 percent_bytes = (qstats.bytes_dropped * 100.0) / 367 (qstats.bytes_total + 1); 368 fprintf(fout, "pkts_dropped_percent:%6.2f\n", percent_pkts); 369 fprintf(fout, "bytes_dropped_percent:%6.2f\n", percent_bytes); 370 371 // ECN CE markings 372 percent_pkts = (qstats.pkts_ecn_ce * 100.0) / 373 (qstats.pkts_total + 1); 374 fprintf(fout, "pkts_ecn_ce:%6.2f (%d)\n", percent_pkts, 375 (int)qstats.pkts_ecn_ce); 376 377 // Average cwnd 378 fprintf(fout, "avg cwnd:%d\n", 379 (int)(qstats.sum_cwnd / (qstats.sum_cwnd_cnt + 1))); 380 // Average rtt 381 fprintf(fout, "avg rtt:%d\n", 382 (int)(qstats.sum_rtt / (qstats.pkts_total + 1))); 383 // Average credit 384 if (edt_flag) 385 fprintf(fout, "avg credit_ms:%.03f\n", 386 (qstats.sum_credit / 387 (qstats.pkts_total + 1.0)) / 1000000.0); 388 else 389 fprintf(fout, "avg credit:%d\n", 390 (int)(qstats.sum_credit / 391 (1500 * ((int)qstats.pkts_total ) + 1))); 392 393 // Return values stats 394 for (k = 0; k < RET_VAL_COUNT; k++) { 395 percent_pkts = (qstats.returnValCount[k] * 100.0) / 396 (qstats.pkts_total + 1); 397 fprintf(fout, "%s:%6.2f (%d)\n", returnValNames[k], 398 percent_pkts, (int)qstats.returnValCount[k]); 399 } 400 fclose(fout); 401 } 402 403 if (debugFlag) 404 read_trace_pipe2(); 405 goto cleanup; 406 407err: 408 rc = 1; 409 410cleanup: 411 bpf_link__destroy(link); 412 bpf_object__close(obj); 413 414 if (cg1 != -1) 415 close(cg1); 416 417 if (rc != 0) 418 cleanup_cgroup_environment(); 419 return rc; 420} 421 422static void Usage(void) 423{ 424 printf("This program loads a cgroup skb BPF program to enforce\n" 425 "cgroup output (egress) bandwidth limits.\n\n" 426 "USAGE: hbm [-o] [-d] [-l] [-n <id>] [--no_cn] [-r <rate>]\n" 427 " [-s] [-t <secs>] [-w] [-h] [prog]\n" 428 " Where:\n" 429 " -o indicates egress direction (default)\n" 430 " -d print BPF trace debug buffer\n" 431 " --edt use fq's Earliest Departure Time\n" 432 " -l also limit flows using loopback\n" 433 " -n <#> to create cgroup \"/hbm#\" and attach prog\n" 434 " Default is /hbm1\n" 435 " --no_cn disable CN notifications\n" 436 " -r <rate> Rate in Mbps\n" 437 " -s Update HBM stats\n" 438 " -t <time> Exit after specified seconds (default is 0)\n" 439 " -w Work conserving flag. cgroup can increase\n" 440 " bandwidth beyond the rate limit specified\n" 441 " while there is available bandwidth. Current\n" 442 " implementation assumes there is only eth0\n" 443 " but can be extended to support multiple NICs\n" 444 " -h print this info\n" 445 " prog BPF program file name. Name defaults to\n" 446 " hbm_out_kern.o\n"); 447} 448 449int main(int argc, char **argv) 450{ 451 char *prog = "hbm_out_kern.o"; 452 int k; 453 int cg_id = 1; 454 char *optstring = "iodln:r:st:wh"; 455 struct option loptions[] = { 456 {"no_cn", 0, NULL, 1}, 457 {"edt", 0, NULL, 2}, 458 {NULL, 0, NULL, 0} 459 }; 460 461 while ((k = getopt_long(argc, argv, optstring, loptions, NULL)) != -1) { 462 switch (k) { 463 case 1: 464 no_cn_flag = true; 465 break; 466 case 2: 467 prog = "hbm_edt_kern.o"; 468 edt_flag = true; 469 break; 470 case'o': 471 break; 472 case 'd': 473 debugFlag = true; 474 break; 475 case 'l': 476 loopback_flag = true; 477 break; 478 case 'n': 479 cg_id = atoi(optarg); 480 break; 481 case 'r': 482 minRate = atoi(optarg) * 1.024; 483 rate = minRate; 484 break; 485 case 's': 486 stats_flag = true; 487 break; 488 case 't': 489 dur = atoi(optarg); 490 break; 491 case 'w': 492 work_conserving_flag = true; 493 break; 494 case '?': 495 if (optopt == 'n' || optopt == 'r' || optopt == 't') 496 fprintf(stderr, 497 "Option -%c requires an argument.\n\n", 498 optopt); 499 case 'h': 500 __fallthrough; 501 default: 502 Usage(); 503 return 0; 504 } 505 } 506 507 if (optind < argc) 508 prog = argv[optind]; 509 printf("HBM prog: %s\n", prog != NULL ? prog : "NULL"); 510 511 /* Use libbpf 1.0 API mode */ 512 libbpf_set_strict_mode(LIBBPF_STRICT_ALL); 513 514 return run_bpf_prog(prog, cg_id); 515}