% Workshop paper in the ACSACW 2025 proceedings (IEEE).
% - The hash-like citation key is kept unchanged so existing citations resolve.
% - eventdate holds the conference dates, read as DD-MM-YYYY from the source
%   record (8-12 December 2025) -- TODO confirm against the conference site.
% - eventdate and copyright are ignored by classic BibTeX styles; biblatex
%   prints eventdate after the proceedings title.
% - series is intentionally omitted: the proceedings title lives in booktitle.
@inproceedings{314a6ca077d74720903c94efdc3fbc83,
  author    = {Missotten, Ruben and Rimmer, Vera and Mees, Wim and Desmet, Lieven},
  title     = {On the Potential of {LLMs} for Offensive Security: Benchmarks vs. Operational Reality},
  booktitle = {Proceedings - 2025 Annual Computer Security Applications Conference Workshops, {ACSACW} 2025},
  publisher = {Institute of Electrical and Electronics Engineers Inc.},
  year      = {2025},
  pages     = {420--427},
  doi       = {10.1109/ACSACW69556.2025.00052},
  eventdate = {2025-12-08/2025-12-12},
  language  = {English},
  keywords  = {Benchmark, Cyber Kill Chain, LLM, MITRE ATT\&CK, offensive security, penetration testing, red teaming},
  copyright = {{\textcopyright} 2025 IEEE},
  abstract  = {Large Language Models (LLMs), through their strong capabilities in code generation, reasoning, and tool use, have demonstrated promising results in security tasks involving vulnerability discovery and exploitation. However, evaluating their offensive potential in automating penetration testing - a more complex and multi-stage process - remains a critical research challenge. While existing evaluation frameworks effectively demonstrate LLM capabilities in isolated or simplified scenarios, they often do not extend toward the complexity of interconnected attack chains characteristic of real-world adversarial operations. In this analytical study, we examine the challenge of assessing the feasibility of LLM-powered automation across the full adversarial pipeline within realistic environments. We contribute an analysis of current benchmarks and associated environments, and highlight opportunities for methodological enhancements that would strengthen alignment between academic evaluations and operational realities.},
}