# Copyright (c) Facebook, Inc. and its affiliates.
#
# This source code is licensed under the MIT license found in the
# LICENSE file in the root directory of this source tree.
"""This module defines a utility function for constructing LLVM benchmarks."""
import logging
import os
import random
import subprocess
import sys
import tempfile
from concurrent.futures import as_completed
from datetime import datetime
from pathlib import Path
from signal import Signals
from typing import Iterable, List, Optional, Union
from compiler_gym.datasets import Benchmark, BenchmarkInitError
from compiler_gym.third_party import llvm
from compiler_gym.util.runfiles_path import transient_cache_path
from compiler_gym.util.thread_pool import get_thread_pool_executor
def _communicate(process, input=None, timeout=None):
    """subprocess.communicate() which kills and reaps the subprocess on timeout.

    :param process: A :code:`subprocess.Popen` instance to communicate with.
    :param input: Data to send to the process's stdin, or :code:`None`.
    :param timeout: The maximum number of seconds to wait for the process, or
        :code:`None` to wait indefinitely.
    :return: The :code:`(stdout, stderr)` tuple returned by
        :code:`process.communicate()`.
    :raises subprocess.TimeoutExpired: If the process does not finish within
        :code:`timeout` seconds. The process is killed before the exception
        propagates.
    """
    try:
        return process.communicate(input=input, timeout=timeout)
    except subprocess.TimeoutExpired:
        process.kill()
        # Killing is not enough: the child must also be waited on, otherwise
        # it lingers as a zombie and process.returncode is never populated.
        # The wait is bounded in case the pipes are held open by something
        # other than the killed child.
        try:
            process.communicate(timeout=timeout)
        except subprocess.TimeoutExpired:
            pass  # Stubborn process will not die; nothing more can be done.
        raise
def get_compiler_includes(compiler: str) -> Iterable[Path]:
    """Yield the system header search paths reported by a compiler.

    The compiler is run in verbose mode on an empty C++ translation unit read
    from stdin, and the include search list that it prints to stderr is parsed.

    :param compiler: The compiler executable to invoke.
    :return: An iterable of header search directories, in the order reported
        by the compiler.
    :raises OSError: If the compiler cannot be started, exits with a nonzero
        returncode, or its output does not contain a complete search list.
    """
    # The compiled 'binary' is written to a scratch directory because the GNU
    # assembler does not support piping to stdout.
    with tempfile.TemporaryDirectory() as scratch:
        argv = [compiler, "-xc++", "-v", "-c", "-", "-o", str(Path(scratch) / "a.out")]
        try:
            proc = subprocess.Popen(
                argv,
                stdout=subprocess.DEVNULL,
                stderr=subprocess.PIPE,
                stdin=subprocess.PIPE,
                universal_newlines=True,
            )
        except FileNotFoundError as e:
            raise OSError(
                f"Failed to invoke {compiler}. "
                f"Is there a working system compiler?\n"
                f"Error: {e}"
            ) from e
        _, stderr = _communicate(proc, input="", timeout=30)
        if proc.returncode:
            raise OSError(
                f"Failed to invoke {compiler}. "
                f"Is there a working system compiler?\n"
                f"Error: {stderr.strip()}"
            )

    # Parse the conventional output format used by clang and GCC:
    #
    #     #include <...> search starts here:
    #     /path/1
    #     /path/2
    #     End of search list
    remaining = iter(stderr.split("\n"))

    # Phase one: consume everything up to and including the list header.
    for line in remaining:
        if line.startswith("#include <...> search starts here:"):
            break

    # Phase two: every line until the terminator is a search path.
    for line in remaining:
        if line.startswith("End of search list"):
            return
        include_dir = Path(line.strip())
        yield include_dir
        # Compatibility fix for compiling benchmark sources which use the
        # '#include <endian.h>' header, which on macOS is located in a
        # 'machine/endian.h' directory.
        machine_dir = include_dir / "machine"
        if machine_dir.is_dir():
            yield machine_dir

    # Reaching this point means the header or the terminator was never seen.
    msg = f"Failed to parse '#include <...>' search paths from {compiler}"
    stderr = stderr.strip()
    if stderr:
        msg += f":\n{stderr}"
    raise OSError(msg)
# Memoized system header search paths. This is None until the first call to
# get_system_includes() populates it; always access the paths through that
# function rather than reading this variable directly.
_SYSTEM_INCLUDES: Optional[List[Path]] = None
def get_system_includes() -> List[Path]:
    """Determine the system include paths for C/C++ compilation jobs.

    This uses the system compiler to determine the search paths for C/C++ system
    headers. By default, :code:`c++` is invoked. This can be overridden by
    setting :code:`os.environ["CXX"]`.

    The result is computed once and cached in a module-level variable. If the
    compiler fails, or if the search paths cannot be determined, the
    :code:`OSError` is logged as a warning and an empty list is cached.

    :return: A list of paths to system header directories.
    """
    global _SYSTEM_INCLUDES

    # Fast path: the search paths were already computed by an earlier call.
    if _SYSTEM_INCLUDES is not None:
        return _SYSTEM_INCLUDES

    cxx = os.environ.get("CXX", "c++")
    try:
        discovered = list(get_compiler_includes(cxx))
    except OSError as err:
        logging.warning("%s", err)
        discovered = []

    _SYSTEM_INCLUDES = discovered
    return _SYSTEM_INCLUDES
class ClangInvocation:
    """Class to represent a single invocation of the clang compiler."""

    def __init__(
        self, args: List[str], system_includes: bool = True, timeout: int = 600
    ):
        """Create a clang invocation.

        :param args: The list of arguments to pass to clang.
        :param system_includes: Whether to include the system standard libraries
            during compilation jobs. This requires a system toolchain. See
            :func:`get_system_includes`.
        :param timeout: The maximum number of seconds to allow clang to run
            before terminating.
        """
        self.args = args
        self.system_includes = system_includes
        self.timeout = timeout

    def command(self, outpath: Path) -> List[str]:
        """Build the full clang command line for this invocation.

        :param outpath: The path that clang should write the bitcode to.
        :return: The argument list: the clang binary, an :code:`-isystem` flag
            per system include directory (if enabled), the user arguments, and
            the flags that emit LLVM bitcode to :code:`outpath`.
        """
        clang = str(llvm.clang_path())
        include_flags: List[str] = []
        if self.system_includes:
            for directory in get_system_includes():
                include_flags.extend(("-isystem", str(directory)))
        user_args = [str(arg) for arg in self.args]
        emit_flags = ["-c", "-emit-llvm", "-o", str(outpath)]
        return [clang] + include_flags + user_args + emit_flags

    @classmethod
    def from_c_file(
        cls,
        path: Path,
        copt: Optional[List[str]] = None,
        system_includes: bool = True,
        timeout: int = 600,
    ) -> "ClangInvocation":
        """Create an invocation that compiles a single C/C++ source file.

        :param path: The source file to compile.
        :param copt: Extra arguments placed after the default flags and before
            the source path.
        :param system_includes: Forwarded to the constructor.
        :param timeout: Forwarded to the constructor.
        :return: A :code:`ClangInvocation` instance.
        """
        # NOTE(cummins): There is some discussion about the best way to create a
        # bitcode that is unoptimized yet does not hinder downstream
        # optimization opportunities. Here we are using a configuration based on
        # -O1 in which we prevent the -O1 optimization passes from running. This
        # is because LLVM produces different function attributes depending on
        # the optimization level. E.g. "-O0 -Xclang -disable-llvm-optzns -Xclang
        # -disable-O0-optnone" will generate code with "noinline" attributes set
        # on the functions, whereas "-Oz -Xclang -disable-llvm-optzns" will
        # generate functions with "minsize" and "optsize" attributes set.
        #
        # See also:
        #   <https://lists.llvm.org/pipermail/llvm-dev/2018-August/thread.html#125365>
        #   <https://github.com/facebookresearch/CompilerGym/issues/110>
        unoptimized_flags = [
            "-O1",
            "-Xclang",
            "-disable-llvm-passes",
            "-Xclang",
            "-disable-llvm-optzns",
        ]
        extra_flags = copt or []
        return cls(
            unoptimized_flags + extra_flags + [str(path)],
            system_includes=system_includes,
            timeout=timeout,
        )
def _run_command(cmd: List[str], timeout: int):
    """Run a command to completion, raising an error if it fails.

    The command's stdout is discarded and its stderr is captured for use in
    the error message.

    :param cmd: The command to run, as a list of arguments.
    :param timeout: The maximum number of seconds to allow the command to run.
    :raises BenchmarkInitError: If the command exits with a nonzero returncode.
    :raises subprocess.TimeoutExpired: If the command exceeds :code:`timeout`
        seconds.
    """
    process = subprocess.Popen(
        cmd, stdout=subprocess.DEVNULL, stderr=subprocess.PIPE, universal_newlines=True
    )
    _, stderr = _communicate(process, timeout=timeout)
    if not process.returncode:
        return

    returncode = process.returncode
    description = str(returncode)
    # Only negative returncodes denote termination by a signal. A positive
    # value is an ordinary exit status and must not be decoded as a signal
    # (e.g. exit status 1 is not SIGHUP).
    if returncode < 0:
        try:
            description = f"{returncode} ({Signals(-returncode).name})"
        except ValueError:
            pass  # Not a known signal number; report the bare returncode.

    raise BenchmarkInitError(
        f"Compilation job failed with returncode {description}\n"
        # Arguments are stringified so that path-like entries cannot break
        # the construction of the error message.
        f"Command: {' '.join(str(arg) for arg in cmd)}\n"
        f"Stderr: {stderr.strip()}"
    )
def make_benchmark(
    inputs: Union[str, Path, ClangInvocation, List[Union[str, Path, ClangInvocation]]],
    copt: Optional[List[str]] = None,
    system_includes: bool = True,
    timeout: int = 600,
) -> Benchmark:
    """Create a benchmark for use by LLVM environments.

    This function takes one or more inputs and uses them to create a benchmark
    that can be passed to :meth:`compiler_gym.envs.LlvmEnv.reset`.

    For single-source C/C++ programs, you can pass the path of the source file:

        >>> benchmark = make_benchmark('my_app.c')
        >>> env = gym.make("llvm-v0")
        >>> env.reset(benchmark=benchmark)

    The clang invocation used is roughly equivalent to:

    .. code-block::

        $ clang -O1 -Xclang -disable-llvm-passes -Xclang -disable-llvm-optzns \\
            my_app.c -c -emit-llvm -o benchmark.bc

    with an additional :code:`-isystem` flag for each system include directory
    when :code:`system_includes` is set.

    Additional compile-time arguments to clang can be provided using the
    :code:`copt` argument:

        >>> benchmark = make_benchmark('/path/to/my_app.cpp', copt=['-O2'])

    If you need more fine-grained control over the options, you can directly
    construct a :class:`ClangInvocation
    <compiler_gym.envs.llvm.ClangInvocation>` to pass a list of arguments to
    clang:

        >>> benchmark = make_benchmark(
            ClangInvocation(['/path/to/my_app.c'], timeout=10)
        )

    For multi-file programs, pass a list of inputs that will be compiled
    separately and then linked to a single module:

        >>> benchmark = make_benchmark([
            'main.c',
            'lib.cpp',
            'lib2.bc',
        ])

    If you already have prepared bitcode files, those can be linked and used
    directly:

        >>> benchmark = make_benchmark([
            'bitcode1.bc',
            'bitcode2.bc',
        ])

    Text-format LLVM assembly can also be used:

        >>> benchmark = make_benchmark('module.ll')

    .. note::

        LLVM bitcode compatibility is
        `not guaranteed <https://llvm.org/docs/DeveloperPolicy.html#ir-backwards-compatibility>`_,
        so you must ensure that any precompiled bitcodes are compatible with the
        LLVM version used by CompilerGym, which can be queried using
        :func:`env.compiler_version <compiler_gym.envs.CompilerEnv.compiler_version>`.

    :param inputs: An input, or list of inputs.
    :param copt: A list of command line options to pass to clang when compiling
        source files.
    :param system_includes: Whether to include the system standard libraries
        during compilation jobs. This requires a system toolchain. See
        :func:`get_system_includes`.
    :param timeout: The maximum number of seconds to allow clang to run before
        terminating.
    :return: A :code:`Benchmark` instance.
    :raises FileNotFoundError: If any input sources are not found.
    :raises TypeError: If the inputs are of unsupported types.
    :raises ValueError: If an input file has an unrecognized suffix, or if no
        inputs are provided.
    :raises BenchmarkInitError: If a compilation, assembly, or link job fails.
    :raises TimeoutExpired: If a compilation job exceeds :code:`timeout`
        seconds.
    """
    copt = copt or []

    bitcodes: List[Path] = []
    clang_jobs: List[ClangInvocation] = []
    ll_paths: List[Path] = []

    def _add_path(path: Path):
        """Classify a file by suffix into bitcode, clang job, or LLVM assembly."""
        if not path.is_file():
            raise FileNotFoundError(path)

        if path.suffix == ".bc":
            bitcodes.append(path)
        elif path.suffix in {".c", ".cxx", ".cpp", ".cc"}:
            clang_jobs.append(
                ClangInvocation.from_c_file(
                    path, copt=copt, system_includes=system_includes, timeout=timeout
                )
            )
        elif path.suffix == ".ll":
            ll_paths.append(path)
        else:
            raise ValueError(f"Unrecognized file type: {path.name}")

    def _add_input(item):
        """Register a single user-provided input of any supported type."""
        if isinstance(item, (str, Path)):
            _add_path(Path(item))
        elif isinstance(item, ClangInvocation):
            clang_jobs.append(item)
        else:
            raise TypeError(f"Invalid input type: {type(item).__name__}")

    # Determine from inputs the list of pre-compiled bitcodes and the clang
    # invocations required to compile the bitcodes.
    if isinstance(inputs, (str, Path, ClangInvocation)):
        _add_input(inputs)
    else:
        for item in inputs:
            _add_input(item)

    # Reject empty input before touching the filesystem.
    if not (bitcodes or clang_jobs or ll_paths):
        raise ValueError("No inputs")

    # Shortcut if we only have a single pre-compiled bitcode. This must not be
    # taken when there are also sources or LLVM assembly files to process,
    # otherwise those inputs would be silently dropped.
    if len(bitcodes) == 1 and not clang_jobs and not ll_paths:
        bitcode = bitcodes[0]
        return Benchmark.from_file(uri=f"file:///{bitcode}", path=bitcode)

    tmpdir_root = transient_cache_path(".")
    tmpdir_root.mkdir(exist_ok=True, parents=True)
    with tempfile.TemporaryDirectory(
        dir=tmpdir_root, prefix="llvm-make_benchmark-"
    ) as d:
        working_dir = Path(d)

        clang_outs = [
            working_dir / f"clang-out-{i}.bc" for i in range(1, len(clang_jobs) + 1)
        ]
        llvm_as_outs = [
            working_dir / f"llvm-as-out-{i}.bc" for i in range(1, len(ll_paths) + 1)
        ]

        # Run the clang and llvm-as invocations in parallel. Avoid running this
        # code path if possible as get_thread_pool_executor() requires locking.
        if clang_jobs or ll_paths:
            llvm_as_path = str(llvm.llvm_as_path())
            executor = get_thread_pool_executor()

            # Every argument is a plain string so that the command can be
            # joined into an error message if the job fails.
            llvm_as_commands = [
                [llvm_as_path, str(ll_path), "-o", str(bc_path)]
                for ll_path, bc_path in zip(ll_paths, llvm_as_outs)
            ]

            # Fire off the clang and llvm-as jobs.
            futures = [
                executor.submit(_run_command, job.command(out), job.timeout)
                for job, out in zip(clang_jobs, clang_outs)
            ] + [
                executor.submit(_run_command, command, timeout)
                for command in llvm_as_commands
            ]

            # Block until finished. result() re-raises any job failure.
            for future in as_completed(futures):
                future.result()

            # Check that the expected files were generated.
            for clang_job, bc_path in zip(clang_jobs, clang_outs):
                if not bc_path.is_file():
                    raise BenchmarkInitError(
                        f"clang failed: {' '.join(clang_job.command(bc_path))}"
                    )
            for command, bc_path in zip(llvm_as_commands, llvm_as_outs):
                if not bc_path.is_file():
                    raise BenchmarkInitError(f"llvm-as failed: {command}")

        all_outs = bitcodes + clang_outs + llvm_as_outs
        if len(all_outs) == 1:
            # We only have a single bitcode so read it.
            with open(str(all_outs[0]), "rb") as f:
                bitcode = f.read()
        else:
            # Link all of the bitcodes into a single module, including those
            # assembled from text-format LLVM assembly.
            llvm_link_cmd = [str(llvm.llvm_link_path()), "-o", "-"] + [
                str(path) for path in all_outs
            ]
            llvm_link = subprocess.Popen(
                llvm_link_cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE
            )
            bitcode, stderr = _communicate(llvm_link, timeout=timeout)
            if llvm_link.returncode:
                raise BenchmarkInitError(
                    f"Failed to link LLVM bitcodes with error: {stderr.decode('utf-8')}"
                )

    # Year, month, day, then time of day; a random suffix disambiguates
    # benchmarks created within the same second.
    timestamp = datetime.now().strftime("%Y%m%dT%H%M%S")
    uri = f"benchmark://user/{timestamp}-{random.randrange(16**4):04x}"
    return Benchmark.from_file_contents(uri, bitcode)