diff --git a/.buildkite/check-wheel-size.py b/.buildkite/check-wheel-size.py
index a378bc6baa..e29881fcba 100644
--- a/.buildkite/check-wheel-size.py
+++ b/.buildkite/check-wheel-size.py
@@ -8,12 +8,12 @@ import zipfile
 # Note that we have 400 MiB quota, please use it wisely.
 # See https://github.com/pypi/support/issues/3792 .
 # Please also sync the value with the one in Dockerfile.
-VLLM_MAX_SIZE_MB = int(os.environ.get('VLLM_MAX_SIZE_MB', 400))
+VLLM_MAX_SIZE_MB = int(os.environ.get("VLLM_MAX_SIZE_MB", 400))
 
 
 def print_top_10_largest_files(zip_file):
     """Print the top 10 largest files in the given zip file."""
-    with zipfile.ZipFile(zip_file, 'r') as z:
+    with zipfile.ZipFile(zip_file, "r") as z:
         file_sizes = [(f, z.getinfo(f).file_size) for f in z.namelist()]
         file_sizes.sort(key=lambda x: x[1], reverse=True)
         for f, size in file_sizes[:10]:
@@ -28,14 +28,18 @@ def check_wheel_size(directory):
                 wheel_path = os.path.join(root, file_name)
                 wheel_size_mb = os.path.getsize(wheel_path) / (1024 * 1024)
                 if wheel_size_mb > VLLM_MAX_SIZE_MB:
-                    print(f"Not allowed: Wheel {wheel_path} is larger "
-                          f"({wheel_size_mb:.2f} MB) than the limit "
-                          f"({VLLM_MAX_SIZE_MB} MB).")
+                    print(
+                        f"Not allowed: Wheel {wheel_path} is larger "
+                        f"({wheel_size_mb:.2f} MB) than the limit "
+                        f"({VLLM_MAX_SIZE_MB} MB)."
+                    )
                     print_top_10_largest_files(wheel_path)
                     return 1
                 else:
-                    print(f"Wheel {wheel_path} is within the allowed size "
-                          f"({wheel_size_mb:.2f} MB).")
+                    print(
+                        f"Wheel {wheel_path} is within the allowed size "
+                        f"({wheel_size_mb:.2f} MB)."
+                    )
     return 0
 
 
@@ -45,4 +49,4 @@ if __name__ == "__main__":
         sys.exit(1)
 
     directory = sys.argv[1]
-    sys.exit(check_wheel_size(directory))
\ No newline at end of file
+    sys.exit(check_wheel_size(directory))
diff --git a/.buildkite/generate_index.py b/.buildkite/generate_index.py
index 36e1b6c013..270663c415 100644
--- a/.buildkite/generate_index.py
+++ b/.buildkite/generate_index.py
@@ -22,5 +22,5 @@ with open("index.html", "w") as f:
     print(f"Generated index.html for {args.wheel}")
     # cloudfront requires escaping the '+' character
     f.write(
-        template.format(wheel=filename,
-                        wheel_html_escaped=filename.replace("+", "%2B")))
+        template.format(wheel=filename, wheel_html_escaped=filename.replace("+", "%2B"))
+    )
diff --git a/.buildkite/lm-eval-harness/conftest.py b/.buildkite/lm-eval-harness/conftest.py
index a0bcc993ed..769d2efda4 100644
--- a/.buildkite/lm-eval-harness/conftest.py
+++ b/.buildkite/lm-eval-harness/conftest.py
@@ -8,11 +8,14 @@ def pytest_addoption(parser):
     parser.addoption(
         "--config-list-file",
         action="store",
-        help="Path to the file listing model config YAMLs (one per line)")
-    parser.addoption("--tp-size",
-                     action="store",
-                     default="1",
-                     help="Tensor parallel size to use for evaluation")
+        help="Path to the file listing model config YAMLs (one per line)",
+    )
+    parser.addoption(
+        "--tp-size",
+        action="store",
+        default="1",
+        help="Tensor parallel size to use for evaluation",
+    )
 
 
 @pytest.fixture(scope="session")
@@ -33,7 +36,8 @@ def pytest_generate_tests(metafunc):
         config_dir = config_list_file.parent
         with open(config_list_file, encoding="utf-8") as f:
             configs = [
-                config_dir / line.strip() for line in f
+                config_dir / line.strip()
+                for line in f
                 if line.strip() and not line.startswith("#")
             ]
         metafunc.parametrize("config_filename", configs)
diff --git a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
index c5411daf0d..409a6ca820 100644
--- a/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
+++ b/.buildkite/lm-eval-harness/test_lm_eval_correctness.py
@@ -16,19 +16,22 @@
 RTOL = 0.08
 
 
 def launch_lm_eval(eval_config, tp_size):
-    trust_remote_code = eval_config.get('trust_remote_code', False)
-    model_args = f"pretrained={eval_config['model_name']}," \
-                 f"tensor_parallel_size={tp_size}," \
-                 f"enforce_eager=true," \
-                 f"add_bos_token=true," \
-                 f"trust_remote_code={trust_remote_code}"
+    trust_remote_code = eval_config.get("trust_remote_code", False)
+    model_args = (
+        f"pretrained={eval_config['model_name']},"
+        f"tensor_parallel_size={tp_size},"
+        f"enforce_eager=true,"
+        f"add_bos_token=true,"
+        f"trust_remote_code={trust_remote_code}"
+    )
     results = lm_eval.simple_evaluate(
         model="vllm",
         model_args=model_args,
         tasks=[task["name"] for task in eval_config["tasks"]],
         num_fewshot=eval_config["num_fewshot"],
         limit=eval_config["limit"],
-        batch_size="auto")
+        batch_size="auto",
+    )
 
     return results
@@ -42,9 +45,10 @@ def test_lm_eval_correctness_param(config_filename, tp_size):
         for metric in task["metrics"]:
             ground_truth = metric["value"]
             measured_value = results["results"][task["name"]][metric["name"]]
-            print(f'{task["name"]} | {metric["name"]}: '
-                  f'ground_truth={ground_truth} | measured={measured_value}')
-            success = success and np.isclose(
-                ground_truth, measured_value, rtol=RTOL)
+            print(
+                f"{task['name']} | {metric['name']}: "
+                f"ground_truth={ground_truth} | measured={measured_value}"
+            )
+            success = success and np.isclose(ground_truth, measured_value, rtol=RTOL)
 
     assert success
diff --git a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
index 1030ec24e8..7f2a2d8dc2 100644
--- a/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/convert-results-json-to-markdown.py
@@ -65,18 +65,18 @@ def read_markdown(file):
 
 
 def results_to_json(latency, throughput, serving):
-    return json.dumps({
-        'latency': latency.to_dict(),
-        'throughput': throughput.to_dict(),
-        'serving': serving.to_dict()
-    })
+    return json.dumps(
+        {
+            "latency": latency.to_dict(),
+            "throughput": throughput.to_dict(),
+            "serving": serving.to_dict(),
+        }
+    )
 
 
 if __name__ == "__main__":
-
     # collect results
     for test_file in results_folder.glob("*.json"):
-
         with open(test_file) as f:
             raw_result = json.loads(f.read())
 
@@ -120,7 +120,8 @@ if __name__ == "__main__":
             for perc in [10, 25, 50, 75, 90, 99]:
                 # Multiply 1000 to convert the time unit from s to ms
                 raw_result.update(
-                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]})
+                    {f"P{perc}": 1000 * raw_result["percentiles"][str(perc)]}
+                )
             raw_result["avg_latency"] = raw_result["avg_latency"] * 1000
 
             # add the result to raw_result
@@ -153,26 +154,27 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
     throughput_results = pd.DataFrame.from_dict(throughput_results)
 
-    raw_results_json = results_to_json(latency_results, throughput_results,
-                                       serving_results)
+    raw_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )
 
     # remapping the key, for visualization purpose
     if not latency_results.empty:
-        latency_results = latency_results[list(
-            latency_column_mapping.keys())].rename(
-                columns=latency_column_mapping)
+        latency_results = latency_results[list(latency_column_mapping.keys())].rename(
+            columns=latency_column_mapping
+        )
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
     if not throughput_results.empty:
-        throughput_results = throughput_results[list(
-            throughput_results_column_mapping.keys())].rename(
-                columns=throughput_results_column_mapping)
+        throughput_results = throughput_results[
+            list(throughput_results_column_mapping.keys())
+        ].rename(columns=throughput_results_column_mapping)
 
-    processed_results_json = results_to_json(latency_results,
-                                             throughput_results,
-                                             serving_results)
+    processed_results_json = results_to_json(
+        latency_results, throughput_results, serving_results
+    )
 
     for df in [latency_results, serving_results, throughput_results]:
         if df.empty:
@@ -184,38 +186,39 @@ if __name__ == "__main__":
         # The GPUs sometimes come in format of "GPUTYPE\nGPUTYPE\n...",
         # we want to turn it into "8xGPUTYPE"
         df["GPU"] = df["GPU"].apply(
-            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}")
+            lambda x: f"{len(x.split('\n'))}x{x.split('\n')[0]}"
+        )
 
     # get markdown tables
-    latency_md_table = tabulate(latency_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    serving_md_table = tabulate(serving_results,
-                                headers='keys',
-                                tablefmt='pipe',
-                                showindex=False)
-    throughput_md_table = tabulate(throughput_results,
-                                   headers='keys',
-                                   tablefmt='pipe',
-                                   showindex=False)
+    latency_md_table = tabulate(
+        latency_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    serving_md_table = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
+    throughput_md_table = tabulate(
+        throughput_results, headers="keys", tablefmt="pipe", showindex=False
+    )
 
     # document the result
     with open(results_folder / "benchmark_results.md", "w") as f:
-
-        results = read_markdown("../.buildkite/nightly-benchmarks/" +
-                                "performance-benchmarks-descriptions.md")
+        results = read_markdown(
+            "../.buildkite/nightly-benchmarks/"
+            + "performance-benchmarks-descriptions.md"
+        )
         results = results.format(
             latency_tests_markdown_table=latency_md_table,
             throughput_tests_markdown_table=throughput_md_table,
             serving_tests_markdown_table=serving_md_table,
-            benchmarking_results_in_json_string=processed_results_json)
+            benchmarking_results_in_json_string=processed_results_json,
+        )
         f.write(results)
 
     # document benchmarking results in json
     with open(results_folder / "benchmark_results.json", "w") as f:
-
-        results = latency_results.to_dict(
-            orient='records') + throughput_results.to_dict(
-                orient='records') + serving_results.to_dict(orient='records')
+        results = (
+            latency_results.to_dict(orient="records")
+            + throughput_results.to_dict(orient="records")
+            + serving_results.to_dict(orient="records")
+        )
         f.write(json.dumps(results))
diff --git a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
index 5e17b79d26..778a3a8d87 100644
--- a/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
+++ b/.buildkite/nightly-benchmarks/scripts/download-tokenizer.py
@@ -14,15 +14,12 @@ def main(model, cachedir):
 
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(
-        description="Download and save Hugging Face tokenizer")
-    parser.add_argument("--model",
-                        type=str,
-                        required=True,
-                        help="Name of the model")
-    parser.add_argument("--cachedir",
-                        type=str,
-                        required=True,
-                        help="Directory to save the tokenizer")
+        description="Download and save Hugging Face tokenizer"
+    )
+    parser.add_argument("--model", type=str, required=True, help="Name of the model")
+    parser.add_argument(
+        "--cachedir", type=str, required=True, help="Directory to save the tokenizer"
+    )
 
     args = parser.parse_args()
     main(args.model, args.cachedir)
diff --git a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
index 0ff95a0911..10a7a2f5a4 100644
--- a/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
+++ b/.buildkite/nightly-benchmarks/scripts/generate-nightly-markdown.py
@@ -11,33 +11,33 @@ from tabulate import tabulate
 
 
 def parse_arguments():
     parser = argparse.ArgumentParser(
-        description=
-        'Parse command line arguments for summary-nightly-results script.')
-    parser.add_argument('--results-folder',
-                        type=str,
-                        required=True,
-                        help='The folder where the results are stored.')
-    parser.add_argument('--description',
-                        type=str,
-                        required=True,
-                        help='Description of the results.')
+        description="Parse command line arguments for summary-nightly-results script."
+    )
+    parser.add_argument(
+        "--results-folder",
+        type=str,
+        required=True,
+        help="The folder where the results are stored.",
+    )
+    parser.add_argument(
+        "--description", type=str, required=True, help="Description of the results."
+    )
     args = parser.parse_args()
     return args
 
 
 def get_perf(df, method, model, metric):
-
     means = []
 
     for qps in [2, 4, 8, 16, "inf"]:
-        target = df['Test name'].str.contains(model)
-        target = target & df['Engine'].str.contains(method)
-        target = target & df['Test name'].str.contains("qps_" + str(qps))
+        target = df["Test name"].str.contains(model)
+        target = target & df["Engine"].str.contains(method)
+        target = target & df["Test name"].str.contains("qps_" + str(qps))
         filtered_df = df[target]
 
         if filtered_df.empty:
-            means.append(0.)
+            means.append(0.0)
         else:
             means.append(filtered_df[metric].values[0])
 
@@ -45,7 +45,6 @@ def get_perf(df, method, model, metric):
 
 
 def get_perf_w_std(df, method, model, metric):
-
     if metric in ["TTFT", "ITL"]:
         mean = get_perf(df, method, model, "Mean " + metric + " (ms)")
         mean = mean.tolist()
@@ -60,7 +59,8 @@ def get_perf_w_std(df, method, model, metric):
     else:
         assert metric == "Tput"
         mean = get_perf(df, method, model, "Input Tput (tok/s)") + get_perf(
-            df, method, model, "Output Tput (tok/s)")
+            df, method, model, "Output Tput (tok/s)"
+        )
         mean = mean.tolist()
         std = None
 
@@ -80,18 +80,17 @@ def main(args):
     # generate markdown table
     df = pd.DataFrame.from_dict(results)
 
-    md_table = tabulate(df, headers='keys', tablefmt='pipe', showindex=False)
+    md_table = tabulate(df, headers="keys", tablefmt="pipe", showindex=False)
 
     with open(args.description) as f:
         description = f.read()
 
-    description = description.format(
-        nightly_results_benchmarking_table=md_table)
+    description = description.format(nightly_results_benchmarking_table=md_table)
 
     with open("nightly_results.md", "w") as f:
         f.write(description)
 
 
-if __name__ == '__main__':
+if __name__ == "__main__":
     args = parse_arguments()
     main(args)
diff --git a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
index 62ee5e10b5..2a7b37991f 100644
--- a/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
+++ b/.buildkite/nightly-benchmarks/scripts/summary-nightly-results.py
@@ -34,10 +34,8 @@ serving_column_mapping = {
 }
 
 if __name__ == "__main__":
-
     # collect results
     for test_file in results_folder.glob("*.json"):
-
         with open(test_file) as f:
             raw_result = json.loads(f.read())
 
@@ -56,17 +54,16 @@ if __name__ == "__main__":
     serving_results = pd.DataFrame.from_dict(serving_results)
 
     if not serving_results.empty:
-        serving_results = serving_results[list(
-            serving_column_mapping.keys())].rename(
-                columns=serving_column_mapping)
+        serving_results = serving_results[list(serving_column_mapping.keys())].rename(
+            columns=serving_column_mapping
+        )
 
-    serving_md_table_with_headers = tabulate(serving_results,
-                                             headers='keys',
-                                             tablefmt='pipe',
-                                             showindex=False)
+    serving_md_table_with_headers = tabulate(
+        serving_results, headers="keys", tablefmt="pipe", showindex=False
+    )
     # remove the first line of header
-    serving_md_table_lines = serving_md_table_with_headers.split('\n')
-    serving_md_table_without_header = '\n'.join(serving_md_table_lines[2:])
+    serving_md_table_lines = serving_md_table_with_headers.split("\n")
+    serving_md_table_without_header = "\n".join(serving_md_table_lines[2:])
 
     prefix = datetime.datetime.now().strftime("%Y-%m-%d_%H-%M-%S")
     prefix = prefix + "_" + os.environ.get("CURRENT_LLM_SERVING_ENGINE")
@@ -76,10 +73,9 @@ if __name__ == "__main__":
         # document results with header.
         # for those who wants to reproduce our benchmark.
         f.write(serving_md_table_with_headers)
-        f.write('\n')
+        f.write("\n")
 
     # document benchmarking results in json
     with open(results_folder / f"{prefix}_nightly_results.json", "w") as f:
-
-        results = serving_results.to_dict(orient='records')
+        results = serving_results.to_dict(orient="records")
         f.write(json.dumps(results))
diff --git a/.buildkite/pyproject.toml b/.buildkite/pyproject.toml
new file mode 100644
index 0000000000..6ae0c2a399
--- /dev/null
+++ b/.buildkite/pyproject.toml
@@ -0,0 +1,55 @@
+# This local pyproject file is part of the migration from yapf to ruff format.
+# It uses the same core rules as the main pyproject.toml file, but with the
+# following differences:
+# - isort profile is set to black
+# - ruff line length is overridden to 88
+# - deprecated typing ignores (UP006, UP035) have been removed
+
+[tool.isort]
+profile = "black"
+
+[tool.ruff]
+line-length = 88
+exclude = [
+    # External file, leaving license intact
+    "examples/other/fp8/quantizer/quantize.py",
+    "vllm/vllm_flash_attn/flash_attn_interface.pyi"
+]
+
+[tool.ruff.lint.per-file-ignores]
+"vllm/third_party/**" = ["ALL"]
+"vllm/version.py" = ["F401"]
+"vllm/_version.py" = ["ALL"]
+
+[tool.ruff.lint]
+select = [
+    # pycodestyle
+    "E",
+    # Pyflakes
+    "F",
+    # pyupgrade
+    "UP",
+    # flake8-bugbear
+    "B",
+    # flake8-simplify
+    "SIM",
+    # isort
+    "I",
+    # flake8-logging-format
+    "G",
+]
+ignore = [
+    # star imports
+    "F405", "F403",
+    # lambda expression assignment
+    "E731",
+    # Loop control variable not used within loop body
+    "B007",
+    # f-string format
+    "UP032",
+    # Can remove once 3.10+ is the minimum Python version
+    "UP007",
+]
+
+[tool.ruff.format]
+docstring-code-format = true
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 3dc06952c0..23f83db010 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -16,6 +16,8 @@ repos:
   hooks:
   - id: ruff
     args: [--output-format, github, --fix]
+  - id: ruff-format
+    files: ^(.buildkite).*
 - repo: https://github.com/codespell-project/codespell
   rev: v2.4.1
   hooks:
@@ -26,6 +28,8 @@ repos:
   rev: 6.0.1
   hooks:
   - id: isort
+    # necessary during the transition from yapf to ruff format
+    args: [--resolve-all-configs, --config-root, .]
 - repo: https://github.com/pre-commit/mirrors-clang-format
   rev: v20.1.3
   hooks:
diff --git a/pyproject.toml b/pyproject.toml
index 4147b6bdee..0393bb1ed2 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -53,6 +53,7 @@ include = ["vllm*"]
 
 [tool.yapfignore]
 ignore_patterns = [
+    ".buildkite/**",
     "build/**",
 ]
 
@@ -107,6 +108,7 @@ select = [
     "SIM",
     # isort
     # "I",
+    # flake8-logging-format
     "G",
 ]
 ignore = [