diff --git a/swe_bench/make_datasets/__init__.py b/swe_bench/make_datasets/__init__.py
new file mode 100644
index 0000000000..f12b94354a
--- /dev/null
+++ b/swe_bench/make_datasets/__init__.py
@@ -0,0 +1,3 @@
+# -*- coding: utf-8 -*-
+# @Author : stellahong (stellahong@fuzhi.ai)
+# @Desc :
diff --git a/swe_bench/make_datasets/make_dataset.py b/swe_bench/make_datasets/make_dataset.py
new file mode 100644
index 0000000000..ee4fc8c419
--- /dev/null
+++ b/swe_bench/make_datasets/make_dataset.py
@@ -0,0 +1,61 @@
+# -*- coding: utf-8 -*-
+# @Author : stellahong (stellahong@fuzhi.ai)
+# @Desc :
+import os
+from pathlib import Path
+
+from tqdm.auto import tqdm
+
+from data.inference.const import TESTBED
+from metagpt.logs import logger
+from swe_bench.make_datasets.make_instance import prompt_style_2_edits_only
+from swe_bench.utils.parse_diff import filter_changed_line
+from swe_bench.utils.repo_utils import EnvManager
+
+
+def reset_task_env(instance: dict = {}):
+    # reset the env via git reset and git checkout
+    env_manager = EnvManager(testbed=TESTBED)
+
+    patch = instance["patch"]
+    repo = instance["repo"]
+    instance["version"]
+    repo_prefix = repo.replace("/", "__")
+    repo_path = os.path.join(env_manager.testbed, repo_prefix)
+
+    if not os.path.exists(repo_path):
+        return patch, repo, None
+    os.chdir(repo_path)
+    if not env_manager.reset_task_env(instance=instance):
+        return patch, repo, None
+
+    return patch, repo, repo_path
+
+
+def reset_and_copy(instance: dict = {}):
+    patch, repo, repo_path = reset_task_env(instance)
+    if repo_path is None:
+        return
+    env_manager = EnvManager(testbed=TESTBED)
+    repo_prefix = repo.replace("/", "__")
+    version = instance["version"]
+    destination_path = os.path.join(repo_path, f"{repo_prefix}__{version}")
+    env_manager.copy_repo(source_path=repo_path, destination_path=destination_path)
+
+
+def make_oracle_collapsed_instance(instance):
+    # for each instance, reset task env
+    patch, repo, repo_path = reset_task_env(instance)
+    if repo_path is None:
+        return
+    file_changes = filter_changed_line(patch)
+    prompt = prompt_style_2_edits_only(instance, Path(repo_path), list(file_changes.keys()))
+    logger.info(prompt)
+    # todo: save output
+    return {}
+
+
+def make_oracle_collapsed_dataset(dataset):
+    for datum in tqdm(dataset, desc="Inference "):
+        make_oracle_collapsed_instance(instance=datum)
+        # todo: save output
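A minimal driver for the entry point above might look like the sketch below. It assumes the SWE-bench split is loaded with the Hugging Face datasets library and that the directory behind data.inference.const.TESTBED already holds the per-repository checkouts that EnvManager expects; the dataset id and split are illustrative, not taken from this patch.

# Hypothetical driver (not defined in this patch): feeds SWE-bench task
# instances into make_oracle_collapsed_dataset.
from datasets import load_dataset

from swe_bench.make_datasets.make_dataset import make_oracle_collapsed_dataset

if __name__ == "__main__":
    # Each row is a dict with the fields used above: "repo", "version",
    # "patch", "base_commit", "instance_id", "problem_statement", ...
    swe_bench = load_dataset("princeton-nlp/SWE-bench", split="test")
    make_oracle_collapsed_dataset(swe_bench)
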
diff --git a/swe_bench/make_datasets/make_instance.py b/swe_bench/make_datasets/make_instance.py
new file mode 100644
index 0000000000..d5c6f55abe
--- /dev/null
+++ b/swe_bench/make_datasets/make_instance.py
@@ -0,0 +1,193 @@
+from pathlib import Path
+
+import unidiff
+
+PATCH_EXAMPLE = """--- a/file.py
++++ b/file.py
+@@ -1,27 +1,35 @@
+ def euclidean(a, b):
+-    while b:
+-        a, b = b, a % b
+-    return a
++    if b == 0:
++        return a
++    return euclidean(b, a % b)
+ 
+ 
+ def bresenham(x0, y0, x1, y1):
+     points = []
+     dx = abs(x1 - x0)
+     dy = abs(y1 - y0)
+-    sx = 1 if x0 < x1 else -1
+-    sy = 1 if y0 < y1 else -1
+-    err = dx - dy
++    x, y = x0, y0
++    sx = -1 if x0 > x1 else 1
++    sy = -1 if y0 > y1 else 1
+ 
+-    while True:
+-        points.append((x0, y0))
+-        if x0 == x1 and y0 == y1:
+-            break
+-        e2 = 2 * err
+-        if e2 > -dy:
++    if dx > dy:
++        err = dx / 2.0
++        while x != x1:
++            points.append((x, y))
+             err -= dy
+-            x0 += sx
+-        if e2 < dx:
+-            err += dx
+-            y0 += sy
++            if err < 0:
++                y += sy
++                err += dx
++            x += sx
++    else:
++        err = dy / 2.0
++        while y != y1:
++            points.append((x, y))
++            err -= dx
++            if err < 0:
++                x += sx
++                err += dy
++            y += sy
+ 
++    points.append((x, y))
+     return points"""
+
+FULL_GENERATION_EXAMPLE = """[start of /src/this_file.py]
+import os
+
+def euclidean(a, b):
+    if b == 0:
+        return a
+    return euclidean(b, a % b)
+[end of /src/this_file.py]
+[start of /src/another_file.py]
+def bresenham(x0, y0, x1, y1):
+    points = []
+    dx = abs(x1 - x0)
+    dy = abs(y1 - y0)
+    x, y = x0, y0
+    sx = -1 if x0 > x1 else 1
+    sy = -1 if y0 > y1 else 1
+    if dx > dy:
+        err = dx / 2.0
+        while x != x1:
+            points.append((x, y))
+            err -= dy
+            if err < 0:
+                y += sy
+                err += dx
+            x += sx
+    else:
+        err = dy / 2.0
+        while y != y1:
+            points.append((x, y))
+            err -= dx
+            if err < 0:
+                x += sx
+                err += dy
+            y += sy
+    points.append((x, y))
+    return points
+[end of /src/another_file.py]"""
+
+
+def add_lines_list(content):
+    content_with_lines = list()
+    for ix, line in enumerate(content.split("\n"), start=1):
+        content_with_lines.append(f"{ix} {line}")
+    return content_with_lines
+
+
+def add_lines(content):
+    return "\n".join(add_lines_list(content))
+
+
+def make_code_text(files_dict, add_line_numbers=True):
+    all_text = ""
+    for filename, contents in sorted(files_dict.items()):
+        all_text += f"[start of {filename}]\n"
+        if add_line_numbers:
+            all_text += add_lines(contents)
+        else:
+            all_text += contents
+        all_text += f"\n[end of {filename}]\n"
+    return all_text.strip("\n")
+
+
+def make_code_text_edits_only(files_dict, patch, root_path, add_line_numbers=True):
+    files = dict()
+    patch = unidiff.PatchSet(patch)
+    for patched_file in patch:
+        source_file = root_path / patched_file.source_file.split("a/", 1)[-1]
+        files[source_file] = list()
+        for hunk in patched_file:
+            start = hunk.source_start - 15
+            end = start + hunk.source_length + 15
+            files[source_file].append((start, end))
+    all_text = ""
+    for filename, content in files_dict.items():
+        # filename = str(filename)
+        all_text += f"[start of {filename}]\n"
+        content_with_lines = add_lines_list(content)
+        for start, end in files[filename]:
+            if start > 0:
+                all_text += "...\n"
+            all_text += "\n".join(content_with_lines[start:end])
+            all_text += "\n"
+            if end < len(content_with_lines):
+                all_text += "...\n"
+        all_text = all_text.strip("\n")
+        all_text += f"\n[end of {filename}]\n"
+    return all_text.strip("\n")
+
+
+def prompt_style_2_edits_only(instance, root_path, filenames):
+    premise = "You will be provided with a partial code base and an issue statement explaining a problem to resolve."
+
+    readmes = get_readme_files(root_path)
+    instance["readmes"] = ingest_files([root_path / readme for readme in readmes])
+
+    readmes_text = make_code_text(instance["readmes"])
+    instance["file_contents"] = ingest_files([root_path / filename for filename in filenames])
+    code_text = make_code_text_edits_only(instance["file_contents"], instance["patch"], root_path)
+    instructions = (
+        "I need you to solve this issue by generating a single patch file that I can apply "
+        + "directly to this repository using git apply. Please respond with a single patch "
+        + "file in the following format."
+    )
+    problem_statement = instance["problem_statement"]
+    final_text = [
+        premise,
+        "<issue>",
+        problem_statement,
+        "</issue>",
+        "<code>",
+        readmes_text,
+        code_text,
+        "</code>",
+        instructions,
+        "<patch>",
+        PATCH_EXAMPLE,
+        "</patch>",
+    ]
+    final_text = "\n".join(final_text)
+    return final_text
+
+
+def ingest_files(file_paths):
+    files_dict = dict()
+    for file_path in file_paths:
+        files_dict[file_path] = Path.read_text(file_path, encoding="utf-8")
+    return files_dict
+
+
+def get_readme_files(repo_path):
+    path = Path(repo_path)
+    # check whether the file name starts with "readme", case-insensitive
+    files = [file.name for file in path.iterdir() if file.is_file() and file.name.lower().startswith("readme")]
+    return files
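The helpers above produce the bracketed, line-numbered file sections that the oracle prompt is assembled from. A minimal sketch of the resulting layout, using an invented file name and contents:

# Illustrative only: the file name and contents below are made up.
from swe_bench.make_datasets.make_instance import make_code_text

files = {"demo/gcd.py": "def gcd(a, b):\n    return gcd(b, a % b) if b else a"}
print(make_code_text(files))
# [start of demo/gcd.py]
# 1 def gcd(a, b):
# 2     return gcd(b, a % b) if b else a
# [end of demo/gcd.py]
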
diff --git a/swe_bench/utils/repo_utils.py b/swe_bench/utils/repo_utils.py
new file mode 100644
index 0000000000..8bdf776058
--- /dev/null
+++ b/swe_bench/utils/repo_utils.py
@@ -0,0 +1,93 @@
+import os
+import shutil
+import subprocess
+from pathlib import Path
+from typing import Dict
+
+import git
+from git.exc import GitError
+
+from metagpt.logs import logger
+from metagpt.utils.exceptions import handle_exception
+
+KEY_INSTANCE_ID = "instance_id"
+RESET_FAILED = ">>>>> Reset Failed"
+
+
+class ExecWrapper:
+    def __init__(self, subprocess_args: Dict = None):
+        self.subprocess_args = subprocess_args or {}
+
+    @handle_exception(exception_type=subprocess.CalledProcessError)
+    def __call__(self, cmd, raise_error=True, **kwargs):
+        combined_args = {**self.subprocess_args, **kwargs}
+        output = subprocess.run(cmd, **combined_args)
+        return output
+
+
+class EnvManager:
+    def __init__(self, testbed):
+        shellenv = os.environ.copy()
+        self.testbed = testbed
+
+        self.exec = ExecWrapper(
+            subprocess_args={
+                "check": True,
+                "shell": False,
+                "capture_output": True,
+                "text": True,
+                "env": shellenv,
+            }
+        )
+
+    @handle_exception(exception_type=GitError)
+    def clone_repo(self, repo_name: str, path: str, token: str = None):
+        if token is None:
+            token = os.environ.get("GITHUB_TOKEN", "git")
+        if not token:
+            raise ValueError("GitHub token is required for cloning repositories.")
+
+        repo_url = f"https://{token}@github.com/swe-bench/{repo_name.replace('/', '__')}.git"
+        os.makedirs(path, exist_ok=True)
+
+        # Clone the repository
+        git.Repo.clone_from(repo_url, path)
+        logger.info(f"Repository '{repo_name}' cloned successfully.")
+
+    @handle_exception(exception_type=Exception)  # Using a broad exception type for the example
+    def copy_repo(self, source_path: str, destination_path: str):
+        if not os.path.isdir(source_path):
+            raise ValueError("Source path does not exist or is not a directory.")
+
+        os.makedirs(destination_path, exist_ok=True)
+
+        # Copy the repository
+        try:
+            shutil.copytree(
+                source_path, destination_path, dirs_exist_ok=True
+            )  # For Python 3.8+, dirs_exist_ok handles existing directories
+        except TypeError:
+            # Fallback for Python < 3.8, where dirs_exist_ok is not available
+            if os.listdir(destination_path):  # If destination is not empty
+                raise ValueError("Destination directory is not empty and dirs_exist_ok is not supported.")
+            shutil.copytree(source_path, destination_path)
+
+        logger.info(f"Repository contents from '{source_path}' copied successfully to '{destination_path}'.")
+
+    @handle_exception(exception_type=Exception, default_return=False)
+    def reset_task_env(self, instance: Dict):
+        """
+        Reset task environment + testbed and checkout base commit of given task instance
+        """
+        gitignore_path = Path(".gitignore")
+        if gitignore_path.exists():
+            self.exec(["git", "ls-files", "--ignored", "--exclude-standard", "-o", "-z"], raise_error=False)
+            # fixme: need detect platform and change this cmd
+            # self.exec(["xargs", "-0", "-r", "rm", "-rf"], input=gitignore_path.read_text())
+
+        self.exec(["git", "restore", "."])
+        self.exec(["git", "reset", "HEAD", "."])
+        self.exec(["git", "clean", "-fdx"])
+        self.exec(["git", "-c", "advice.detachedHead=false", "checkout", instance["base_commit"]])
+        logger.info(f"[{instance['instance_id']}] Reset task environment to {instance['base_commit']}")
+        return True
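The fixme above flags the commented-out xargs pipeline as platform dependent. One portable alternative is to consume the NUL-separated output of git ls-files directly in Python; the helper below is a sketch of that idea, not part of this patch, and assumes it runs from the repository root, as reset_task_env does.

# Hypothetical helper: deletes files ignored by .gitignore without xargs.
import os
import shutil
import subprocess


def remove_ignored_files() -> None:
    listing = subprocess.run(
        ["git", "ls-files", "--ignored", "--exclude-standard", "-o", "-z"],
        capture_output=True,
        text=True,
        check=True,
    )
    # Paths are NUL-separated; the trailing empty chunk is dropped by filter().
    for rel_path in filter(None, listing.stdout.split("\0")):
        if os.path.isdir(rel_path) and not os.path.islink(rel_path):
            shutil.rmtree(rel_path, ignore_errors=True)
        elif os.path.lexists(rel_path):
            os.remove(rel_path)
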
"-z"], raise_error=False) + # fixme: need detect platform and change this cmd + # self.exec(["xargs", "-0", "-r", "rm", "-rf"], input=gitignore_path.read_text()) + + self.exec(["git", "restore", "."]) + self.exec(["git", "reset", "HEAD", "."]) + self.exec(["git", "clean", "-fdx"]) + self.exec(["git", "-c", "advice.detachedHead=false", "checkout", instance["base_commit"]]) + logger.info(f"[{instance['instance_id']}] Reset task environment to {instance['base_commit']}") + return True diff --git a/swe_bench/utils/utils.py b/swe_bench/utils/utils.py new file mode 100644 index 0000000000..283fcaaae1 --- /dev/null +++ b/swe_bench/utils/utils.py @@ -0,0 +1,81 @@ +import json +import os +import re + +from metagpt.logs import logger + + +def check_existing_ids(output_file): + existing_ids = set() + if os.path.exists(output_file): + with open(output_file, "r") as f: + for line in f: + data = json.loads(line) + instance_id = data["instance_id"] + existing_ids.add(instance_id) + logger.info(f"Read {len(existing_ids)} already completed ids from {output_file}") + return existing_ids + + +def extract_diff(response): + """ + Extracts the diff from a response formatted in different ways + """ + if response is None: + return None + diff_matches = [] + other_matches = [] + pattern = re.compile(r"\<([\w-]+)\>(.*?)\<\/\1\>", re.DOTALL) + for code, match in pattern.findall(response): + if code in {"diff", "patch"}: + diff_matches.append(match) + else: + other_matches.append(match) + pattern = re.compile(r"```(\w+)?\n(.*?)```", re.DOTALL) + for code, match in pattern.findall(response): + if code in {"diff", "patch"}: + diff_matches.append(match) + else: + other_matches.append(match) + if diff_matches: + return diff_matches[0] + if other_matches: + return other_matches[0] + return response.split("")[0] + + +def extract_scripts_from_codetext(codetext: str): + """ + Extracts Python script file names from a given text that contains multiple sections. + Each section starts with '[start of .py]' and ends with '[end of .py]'. + + Parameters: + - codetext (str): A string that may contain multiple sections, each indicating the start of a Python script file. + + Returns: + - list: A list of extracted Python script file names. + + Example of codetext: + ''' + [end of README.rst] + [start of sklearn/compose/_target.py] + ... file content ... + [end of sklearn/compose/_target.py] + [start of another_module/example.py] + ... file content ... + [end of another_module/example.py] + ''' + """ + script_names = [] + + # Match all occurrences of '[start of .py]' + matches = re.findall(r"\[start of ([^\]]+\.py)\]", codetext) + + if matches: + for script_name in matches: + print("Extracted script name:", script_name) + script_names.append(script_name) + else: + print("No script names found in the text.") + + return script_names