Describe the bug
Hello,
I have a question regarding the error described below.
It appears that the error occurs because the `id` values in the corpus data located in the `benchmark/data` folder do not match the `id` values retrieved from the VectorDB.
Full logs
[12/03/24 16:58:20] ERROR [__init__.py:60] >> Unexpected exception __init__.py:60
╭──────────────────────── Traceback (most recent call last) ────────────────────────╮
│ /root/.pyenv/versions/3.10.13/lib/python3.10/runpy.py:196 in _run_module_as_main │
│ │
│ 193 │ main_globals = sys.modules["__main__"].__dict__ │
│ 194 │ if alter_argv: │
│ 195 │ │ sys.argv[0] = mod_spec.origin │
│ ❱ 196 │ return _run_code(code, main_globals, None, │
│ 197 │ │ │ │ │ "__main__", mod_spec) │
│ 198 │
│ 199 def run_module(mod_name, init_globals=None, │
│ │
│ /root/.pyenv/versions/3.10.13/lib/python3.10/runpy.py:86 in _run_code │
│ │
│ 83 │ │ │ │ │ __loader__ = loader, │
│ 84 │ │ │ │ │ __package__ = pkg_name, │
│ 85 │ │ │ │ │ __spec__ = mod_spec) │
│ ❱ 86 │ exec(code, run_globals) │
│ 87 │ return run_globals │
│ 88 │
│ 89 def _run_module_code(code, init_globals=None, │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/adapter/../../debugpy/launcher/../../debugpy/__main__.py:71 in <module> │
│ │
│ 68 │ │
│ 69 │ from debugpy.server import cli │
│ 70 │ │
│ ❱ 71 │ cli.main() │
│ 72 │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py:5 │
│ 01 in main │
│ │
│ 498 │ │ │ │ "code": run_code, │
│ 499 │ │ │ │ "pid": attach_to_pid, │
│ 500 │ │ │ }[options.target_kind] │
│ ❱ 501 │ │ │ run() │
│ 502 │ except SystemExit as exc: │
│ 503 │ │ log.reraise_exception( │
│ 504 │ │ │ "Debuggee exited via SystemExit: {0!r}", exc.code, level="debug │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/adapter/../../debugpy/launcher/../../debugpy/../debugpy/server/cli.py:3 │
│ 51 in run_file │
│ │
│ 348 │ log.describe_environment("Pre-launch environment:") │
│ 349 │ │
│ 350 │ log.info("Running file {0!r}", target) │
│ ❱ 351 │ runpy.run_path(target, run_name="__main__") │
│ 352 │
│ 353 │
│ 354 def run_module(): │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py:310 in run_path │
│ │
│ 307 │ │ # Not a valid sys.path entry, so run the code directly │
│ 308 │ │ # execfile() doesn't help as we want to allow compiled files │
│ 309 │ │ code, fname = _get_code_from_file(run_name, path_name) │
│ ❱ 310 │ │ return _run_module_code(code, init_globals, run_name, pkg_name=pkg_ │
│ script_name=fname) │
│ 311 │ else: │
│ 312 │ │ # Finder is defined for path, so add it to │
│ 313 │ │ # the start of sys.path │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py:127 in _run_module_code │
│ │
│ 124 │ fname = script_name if mod_spec is None else mod_spec.origin │
│ 125 │ with _TempModule(mod_name) as temp_module, _ModifiedArgv0(fname): │
│ 126 │ │ mod_globals = temp_module.module.__dict__ │
│ ❱ 127 │ │ _run_code(code, mod_globals, init_globals, mod_name, mod_spec, pkg_ │
│ script_name) │
│ 128 │ # Copy the globals of the temporary module, as they │
│ 129 │ # may be cleared when the temporary module goes away │
│ 130 │ return mod_globals.copy() │
│ │
│ /root/.vscode-server/extensions/ms-python.debugpy-2024.12.0-linux-x64/bundled/lib │
│ s/debugpy/_vendored/pydevd/_pydevd_bundle/pydevd_runpy.py:118 in _run_code │
│ │
│ 115 │ run_globals.update( │
│ 116 │ │ __name__=mod_name, __file__=fname, __cached__=cached, __doc__=None, │
│ __loader__=loader, __package__=pkg_name, __spec__=mod_spec │
│ 117 │ ) │
│ ❱ 118 │ exec(code, run_globals) │
│ 119 │ return run_globals │
│ 120 │
│ 121 │
│ │
│ /app/evaluate/main.py:186 in <module> │
│ │
│ 183 │ │ │ opt["qa_data_path"] = os.path.join(current_dir, │
│ "qa","parsed_{}_chunk_{}_qa.parquet".format(i,j)) │
│ 184 │ │ │ opt["corpus_data_path"] = os.path.join(current_dir, │
│ "qa","parsed_{}_chunk_{}_corpus.parquet".format(i,j)) │
│ 185 │ │ │ opt["project_dir"] = os.path.join(current_dir, "benchmark") │
│ ❱ 186 │ │ │ evaluate(**opt) │
│ 187 │
│ │
│ /app/evaluate/main.py:158 in evaluate │
│ │
│ 155 │ │ os.makedirs(project_dir) │
│ 156 │ │
│ 157 │ evaluator = Evaluator(qa_data_path, corpus_data_path, project_dir=proje │
│ ❱ 158 │ evaluator.start_trial(config, skip_validation=True) │
│ 159 │
│ 160 │
│ 161 if __name__ == "__main__": │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/evaluator.py:206 in start_trial │
│ │
│ 203 │ │ │ │ if i == 0: │
│ 204 │ │ │ │ │ previous_result = self.qa_data │
│ 205 │ │ │ │ logger.info(f"Running node line {node_line_name}...") │
│ ❱ 206 │ │ │ │ previous_result = run_node_line( │
│ 207 │ │ │ │ │ node_line, node_line_dir, previous_result, progress, ta │
│ 208 │ │ │ │ ) │
│ 209 │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/node_line.py:52 in run_node_line │
│ │
│ 49 │ │
│ 50 │ summary_lst = [] │
│ 51 │ for node in nodes: │
│ ❱ 52 │ │ previous_result = node.run(previous_result, node_line_dir) │
│ 53 │ │ node_summary_df = load_summary_file( │
│ 54 │ │ │ os.path.join(node_line_dir, node.node_type, "summary.csv") │
│ 55 │ │ ) │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/schema/node.py:57 in run │
│ │
│ 54 │ def run(self, previous_result: pd.DataFrame, node_line_dir: str) -> pd. │
│ 55 │ │ logger.info(f"Running node {self.node_type}...") │
│ 56 │ │ input_modules, input_params = self.get_param_combinations() │
│ ❱ 57 │ │ return self.run_node( │
│ 58 │ │ │ modules=input_modules, │
│ 59 │ │ │ module_params=input_params, │
│ 60 │ │ │ previous_result=previous_result, │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/nodes/retrieval/run.py:173 in run_retrieval_node │
│ │
│ 170 │ │ │ │ zip(modules, module_params), │
│ 171 │ │ │ ) │
│ 172 │ │ ) │
│ ❱ 173 │ │ semantic_results, semantic_times = run(semantic_modules, semantic_m │
│ 174 │ │ semantic_summary_df = save_and_summary( │
│ 175 │ │ │ semantic_modules, │
│ 176 │ │ │ semantic_module_params, │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/nodes/retrieval/run.py:71 in run │
│ │
│ 68 │ │ :return: First, it returns list of result dataframe. │
│ 69 │ │ Second, it returns list of execution times. │
│ 70 │ │ """ │
│ ❱ 71 │ │ result, execution_times = zip( │
│ 72 │ │ │ *map( │
│ 73 │ │ │ │ lambda task: measure_speed( │
│ 74 │ │ │ │ │ task[0].run_evaluator, │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/nodes/retrieval/run.py:73 in <lambda> │
│ │
│ 70 │ │ """ │
│ 71 │ │ result, execution_times = zip( │
│ 72 │ │ │ *map( │
│ ❱ 73 │ │ │ │ lambda task: measure_speed( │
│ 74 │ │ │ │ │ task[0].run_evaluator, │
│ 75 │ │ │ │ │ project_dir=project_dir, │
│ 76 │ │ │ │ │ previous_result=previous_result, │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/strategy.py:14 in measure_speed │
│ │
│ 11 │ Method for measuring execution speed of the function. │
│ 12 │ """ │
│ 13 │ start_time = time.time() │
│ ❱ 14 │ result = func(*args, **kwargs) │
│ 15 │ end_time = time.time() │
│ 16 │ return result, end_time - start_time │
│ 17 │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/schema/base.py:27 in run_evaluator │
│ │
│ 24 │ ): │
│ 25 │ │ log_to_file(content=f"Running {cls.__name__} with {cls.run_evaluator │
│ 26 │ │ instance = cls(project_dir, *args, **kwargs) │
│ ❱ 27 │ │ result = instance.pure(previous_result, *args, **kwargs) │
│ 28 │ │ del instance │
│ 29 │ │ return result │
│ 30 │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:72 in wrapper │
│ │
│ 69 │ def decorator_result_to_dataframe(func: Callable): │
│ 70 │ │ @functools.wraps(func) │
│ 71 │ │ def wrapper(*args, **kwargs) -> pd.DataFrame: │
│ ❱ 72 │ │ │ results = func(*args, **kwargs) │
│ 73 │ │ │ if len(column_names) == 1: │
│ 74 │ │ │ │ df_input = {column_names[0]: results} │
│ 75 │ │ │ else: │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/nodes/retrieval/vectordb.py:73 in pure │
│ │
│ 70 │ │ queries = self.cast_to_run(previous_result) │
│ 71 │ │ pure_params = pop_params(self._pure, kwargs) │
│ 72 │ │ ids, scores = self._pure(queries, **pure_params) │
│ ❱ 73 │ │ contents = fetch_contents(self.corpus_df, ids) │
│ 74 │ │ return contents, ids, scores │
│ 75 │ │
│ 76 │ def _pure( │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:40 in fetch_contents │
│ │
│ 37 │ ): │
│ 38 │ │ return list(map(lambda x: fetch_one_content(corpus_data, x, column_ │
│ 39 │ │
│ ❱ 40 │ result = flatten_apply( │
│ 41 │ │ fetch_contents_pure, ids, corpus_data=corpus_data, column_name=colu │
│ 42 │ ) │
│ 43 │ return result │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:377 in flatten_apply │
│ │
│ 374 │ """ │
│ 375 │ df = pd.DataFrame({"col1": nested_list}) │
│ 376 │ df = df.explode("col1") │
│ ❱ 377 │ df["result"] = func(df["col1"].tolist(), **kwargs) │
│ 378 │ return df.groupby(level=0, sort=False)["result"].apply(list).tolist() │
│ 379 │
│ 380 │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:38 in fetch_contents_pure │
│ │
│ 35 │ def fetch_contents_pure( │
│ 36 │ │ ids: List[str], corpus_data: pd.DataFrame, column_name: str │
│ 37 │ ): │
│ ❱ 38 │ │ return list(map(lambda x: fetch_one_content(corpus_data, x, column_ │
│ 39 │ │
│ 40 │ result = flatten_apply( │
│ 41 │ │ fetch_contents_pure, ids, corpus_data=corpus_data, column_name=colu │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:38 in <lambda> │
│ │
│ 35 │ def fetch_contents_pure( │
│ 36 │ │ ids: List[str], corpus_data: pd.DataFrame, column_name: str │
│ 37 │ ): │
│ ❱ 38 │ │ return list(map(lambda x: fetch_one_content(corpus_data, x, column_ │
│ 39 │ │
│ 40 │ result = flatten_apply( │
│ 41 │ │ fetch_contents_pure, ids, corpus_data=corpus_data, column_name=colu │
│ │
│ /root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/s │
│ ite-packages/autorag/utils/util.py:57 in fetch_one_content │
│ │
│ 54 │ │ │ return None │
│ 55 │ │ fetch_result = corpus_data[corpus_data[id_column_name] == id_] │
│ 56 │ │ if fetch_result.empty: │
│ ❱ 57 │ │ │ raise ValueError(f"doc_id: {id_} not found in corpus_data.") │
│ 58 │ │ else: │
│ 59 │ │ │ return fetch_result[column_name].iloc[0] │
│ 60 │ else: │
╰───────────────────────────────────────────────────────────────────────────────────╯
ValueError: doc_id: 5713eedc-1e2e-4395-8efe-06433f40a094 not found in corpus_data.
Exception ignored in: <function VectorDB.__del__ at 0x7f7096dde560>
Traceback (most recent call last):
File "/root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/site-packages/autorag/nodes/retrieval/vectordb.py", line 66, in __del__
File "/root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/site-packages/autorag/nodes/retrieval/base.py", line 28, in __del__
File "/root/.pyenv/versions/3.10.13/lib/python3.10/logging/__init__.py", line 1477, in info
File "/root/.pyenv/versions/3.10.13/lib/python3.10/logging/__init__.py", line 1624, in _log
File "/root/.pyenv/versions/3.10.13/lib/python3.10/logging/__init__.py", line 1634, in handle
File "/root/.pyenv/versions/3.10.13/lib/python3.10/logging/__init__.py", line 1696, in callHandlers
File "/root/.pyenv/versions/3.10.13/lib/python3.10/logging/__init__.py", line 968, in handle
File "/root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/site-packages/rich/logging.py", line 168, in emit
File "/root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/site-packages/rich/logging.py", line 229, in render
File "/root/.cache/pypoetry/virtualenvs/app-rag-sample-9TtSrW0h-py3.10/lib/python3.10/site-packages/rich/_log_render.py", line 43, in __call__
ImportError: sys.meta_path is None, Python is likely shutting down
When the `evaluate` step is performed repeatedly, items seem to accumulate in the VectorDB.
Should the number of items in the VectorDB collection always be reset to zero before performing the `evaluate` step?
It would be very helpful to understand the intention behind implementing this flow, as it will assist me in using this package more effectively.
Thank you for your assistance!
Pay now to fund the work behind this issue.
Get updates on progress being made.
Maintainer is rewarded once the issue is completed.
You're funding impactful open source efforts
You want to contribute to this effort
You want to get funding like this too