RAG-Anything チュートリアル:Colab でテキスト、表、数式、画像を扱うマルチモーダル検索パイプラインの構築方法
MarkTechPost は、テキスト、表、数式、画像を統合的に処理する「RAG-Anything」の Colab チュートリアルを提供し、マルチモーダル検索の実装手順と検証手法を詳細に解説している。
キーポイント
包括的なマルチモーダル対応
本ツールはテキストだけでなく、表、数式、画像といった多様なデータ形式を単一の RAG パイプラインで処理できる点を強調している。
安全な Colab 環境構築ガイド
OpenAI API キーのランタイム入力や依存関係(特に Pillow のバージョン管理)の修復など、実運用を想定した安全かつ再現性の高いセットアップ手順を示している。
多様な検索モードの実験
ナイーブ、ローカル、グローバル、ハイブリッドといった異なる検索戦略を比較検証するフレームワークを提供し、ユースケースに応じた最適な設定の選択を可能にする。
ディレクトリ構成と環境変数の初期化
プロジェクト用のベース、アセット、出力、ストレージ、ログディレクトリを自動作成し、RAG処理に必要なチャンクサイズや並列処理数などの設定値を環境変数として定義します。
API キーの安全な取得とクリーンアップ
ユーザー入力から API キーを取得し、Bearer トークンや余分な文字を除去して正規化し、セキュリティを確保した上でモデル設定に適用します。
OpenAI API の接続テストとモデル確認
チャット機能と埋め込み(embedding)機能の両方を非同期でテスト実行し、API キーの有効性と権限を確認してから処理を開始します。
環境設定と認証の準備
RAG-Anything の実行に必要なディレクトリ、ログ、ランタイム変数を構築し、OpenAI API キーを安全に取得してチャットおよび埋め込み機能の動作を検証します。
影響分析・編集コメントを表示
影響分析
この記事は、複雑なドキュメント形式(数式や表を含む PDF など)を扱う企業における RAG システムの実装課題に対する具体的な解決策を示しており、実務レベルでの導入検討を支援する重要なリソースとなります。特に、依存関係のトラブルシューティングまで含めた詳細な手順は、開発者が環境構築でつまずくリスクを大幅に低減し、マルチモーダル AI の普及を加速させる役割を果たします。
編集コメント
数式や表を含む複雑なドキュメントを扱う RAG システム構築において、環境構築のトラブルシューティングから検索戦略の比較まで網羅的に解説した非常に実用的なチュートリアルです。
本チュートリアルでは、RAG-Anything ワークフローを構築し、テキスト、表、数式、画像にわたるマルチモーダル検索の仕組みを探ります。まず Colab 環境を整え、必要なパッケージをインストールします。その後、実行時に OpenAI API キーを安全に入力することで、ノートブックを実用的かつ安全に実行できるようにします。次に、合成マルチモーダルレポートを作成し、チャートと PDF を生成します。その内容を、RAG-Anything に直接挿入できる content_list 形式に変換し、検索システムに挿入します。
チュートリアルを進めるにつれ、クリーンな OpenAI ベースのチャット、ビジョン、埋め込み関数を設定し、RAG-Anything を初期化します。さらに、単純検索 (naive)、ローカル検索 (local)、グローバル検索 (global)、ハイブリッド検索 (hybrid) といった異なる検索モードをテストします。
Installing RAG-Anything Dependencies
Copy CodeCopiedUse a different Browser
import os
import re
import sys
import json
import time
import shutil
import hashlib
import asyncio
import inspect
import getpass
import subprocess
import importlib
import importlib.metadata
from pathlib import Path
from typing import List, Dict, Any
def run_shell(cmd, check=True):
    """Echo a shell command, run it through the shell, and return its exit status.

    Args:
        cmd: Command line string handed to the shell unchanged.
        check: When true, a non-zero exit status raises instead of being returned.

    Returns:
        The exit status reported by the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    exit_code = subprocess.run(cmd, shell=True, text=True).returncode
    failed = exit_code != 0
    if failed and check:
        raise RuntimeError(f"Command failed: {cmd}")
    return exit_code
def _forget_pillow_modules():
    """Remove ``PIL`` and every ``PIL.*`` submodule from ``sys.modules``."""
    cached = [name for name in sys.modules if name == "PIL" or name.startswith("PIL.")]
    for name in cached:
        del sys.modules[name]


_rule = "=" * 80
print(_rule)
print("RAG-Anything Advanced Colab Tutorial")
print(_rule)
print("\n[1/10] Installing dependencies...")

# Drop any already-imported Pillow modules before the packages change on disk.
_forget_pillow_modules()

# Each requirement is wrapped in double quotes on the command line.
_packages = (
    "raganything[image,text]",
    "openai>=1.0.0",
    "python-dotenv",
    "reportlab",
    "pandas",
    "matplotlib",
    "tabulate",
)
run_shell("pip -q install -U " + " ".join(f'"{pkg}"' for pkg in _packages))

# Pin Pillow to one exact version with a forced, uncached reinstall.
run_shell('pip -q install --no-cache-dir --force-reinstall "pillow==11.3.0"')

# Purge again so the next import resolves the freshly installed files.
_forget_pillow_modules()
importlib.invalidate_caches()

try:
    print("Pillow version:", importlib.metadata.version("Pillow"))
except Exception as e:
    print("Could not read Pillow version:", repr(e))
print("\n[2/10] Importing libraries...")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from openai import AsyncOpenAI
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.utils import EmbeddingFunc
print("Imports successful.")
RAG-Anything ワークフローのための完全な Colab 環境のセットアップから始めます。必要なライブラリをインストールし、Pillow の依存関係を修復し、プロット作成、PDF 生成、OpenAI アクセス、および RAG-Anything に必要なすべてのモジュールを読み込みます。また、セットアップが明確で再実行しやすいように、再利用可能なシェルヘルパーも定義します。
ディレクトリとランタイム変数の設定
コードをコピーしました
別のブラウザを使用してください
print("\n[3/10] ディレクトリとランタイム設定の準備中...")

# Use /content when it exists; otherwise work under the current directory.
if Path("/content").exists():
    BASE_DIR = Path("/content/raganything_advanced_tutorial")
else:
    BASE_DIR = Path.cwd() / "raganything_advanced_tutorial"

ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR = (
    BASE_DIR / leaf for leaf in ("assets", "output", "rag_storage", "logs")
)

# Run switches read by the steps below.
RESET_STORAGE = True
RUN_FULL_DOCUMENT_PARSE = False
PARSER_FOR_FULL_PARSE = "mineru"
PARSE_METHOD = "auto"

for folder in (BASE_DIR, ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# With RESET_STORAGE on, the storage folder is wiped and recreated empty.
if RESET_STORAGE and WORKING_DIR.exists():
    shutil.rmtree(WORKING_DIR)
WORKING_DIR.mkdir(parents=True, exist_ok=True)

# Runtime settings exported as environment variables (all values are strings).
os.environ.update(
    {
        "LOG_DIR": str(LOG_DIR),
        "SUMMARY_LANGUAGE": "English",
        "ENABLE_LLM_CACHE": "false",
        "ENABLE_LLM_CACHE_FOR_EXTRACT": "false",
        "MAX_ASYNC": "2",
        "CHUNK_SIZE": "900",
        "CHUNK_OVERLAP_SIZE": "120",
        "TIMEOUT": "240",
    }
)

# Remove pre-existing OpenAI / binding variables so only the key entered later is used.
for stale_name in (
    "OPENAI_API_KEY",
    "OPENAI_ORG_ID",
    "OPENAI_ORGANIZATION",
    "OPENAI_PROJECT",
    "OPENAI_DEFAULT_HEADERS",
    "LLM_BINDING_API_KEY",
    "LLM_BINDING_HOST",
):
    os.environ.pop(stale_name, None)

print(f"ベースディレクトリ: {BASE_DIR}")
print(f"アセットディレクトリ: {ASSET_DIR}")
print(f"ストレージディレクトリ: {WORKING_DIR}")
print("\n[4/10] OpenAI API キーを安全に入力中...")


def clean_api_key(raw_value: str) -> str:
    """Normalize a pasted API key into a bare ASCII token.

    Removes, in order: a ``Bearer`` scheme in any letter case, all whitespace,
    all non-ASCII characters, surrounding quote/backtick characters in any
    mix or nesting, and a leading ``NAME=`` assignment.

    Args:
        raw_value: Text as pasted by the user; ``None`` or empty is accepted.

    Returns:
        The cleaned key, or an empty string when nothing usable remains.
    """
    quote_chars = "'\"`"
    value = str(raw_value or "")
    # The scheme must go before whitespace is removed, because the regex
    # relies on the space that separates "Bearer" from the token.
    value = re.sub(r"\bbearer\s+", "", value, flags=re.IGNORECASE)
    value = re.sub(r"\s+", "", value)
    # Dropping non-ASCII first lets invisible characters sitting outside the
    # quotes disappear before the quotes themselves are stripped.
    value = value.encode("ascii", errors="ignore").decode("ascii")
    # One strip over the whole character set handles mixed wrappers such as `'key'`.
    value = value.strip(quote_chars)
    if "=" in value:
        # Keep only the right-hand side of a pasted NAME=value assignment.
        value = value.split("=", 1)[1].strip(quote_chars)
    return value
# Read the key through a hidden prompt, then normalize the pasted text.
OPENAI_API_KEY_RAW = getpass.getpass("OpenAI API キーをここに貼り付けてください。入力は非表示です:")
OPENAI_API_KEY = clean_api_key(OPENAI_API_KEY_RAW)
if not OPENAI_API_KEY:
    raise ValueError(
        "API キーが取得されませんでした。キーを非表示の入力ボックスに貼り付けて Enter キーを押してください。"
    )
print("取得したキーの長さ:", len(OPENAI_API_KEY))
# The preview shows 12 leading and 6 trailing characters. For a value of 18
# characters or fewer that would print the entire secret, so the preview is
# only shown when it covers at most half of the value.
if len(OPENAI_API_KEY) >= 36:
    print("取得したキーのプレフィックス:", OPENAI_API_KEY[:12] + "...")
    print("取得したキーのサフィックス:", "..." + OPENAI_API_KEY[-6:])
else:
    print("取得した値が短すぎるため、プレフィックスとサフィックスは表示しません。")
LLM_MODEL = "gpt-4o-mini"
VISION_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536
openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
os.environ["LLM_MODEL"] = LLM_MODEL
os.environ["VISION_MODEL"] = VISION_MODEL
os.environ["EMBEDDING_MODEL"] = EMBEDDING_MODEL
os.environ["EMBEDDING_DIM"] = str(EMBEDDING_DIM)
print("取得したキーで OpenAI チャット API をテスト中...")
try:
test_response = await openai_client.chat.completions.create(
model=LLM_MODEL,
messages=[{"role": "user", "content": "Exactly: ok とだけ返信してください"}],
temperature=0,
)
print("チャット API テスト応答:", test_response.choices[0].message.content)
except Exception as e:
raise RuntimeError(
"キーは取得されましたが、OpenAI がリクエストを拒否したか、アカウント/モデルへのアクセスに失敗しました。"
"請求状況やプロジェクトの権限を確認し、これが OpenAI Platform API キーであることを確認してください。"
) from e
print("\nOpenAI 埋め込み (embedding) API をテスト中...")
try:
test_embedding = await openai_client.embeddings.create(
model=EMBEDDING_MODEL,
input=["RAG-Anything 埋め込みテスト"],
)
print("埋め込みベクトルの長さ:", len(test_embedding.data[0].embedding))
except Exception as e:
raise RuntimeError(
"チャットは動作しましたが、埋め込み (embeddings) に失敗しました。API キーに埋め込み権限があることを確認してください。"
) from e
print("OpenAI API キーは正常に動作しています。")
print(f"チャットモデル: {LLM_MODEL}")
print(f"ビジョンモデル: {VISION_MODEL}")
print(f"埋め込みモデル: {EMBEDDING_MODEL}")
print(f"埋め込み次元数: {EMBEDDING_DIM}")
RAG-Anything の実行中に使用される作業ディレクトリ、出力フォルダ、ログ、およびランタイム環境変数を準備します。OpenAI API キーは非表示の入力フィールドを通じて安全に取得し、貼り付けられた値をクリーンアップして、チャット呼び出しと埋め込み呼び出しの両方が正しく機能することを確認します。また、本チュートリアルの残りの部分を支えるモデルと埋め込み次元数も定義します。
合成マルチモーダルレポートの生成
コードをコピーしました
別のブラウザを使用する
print("\n[5/10] Creating a synthetic multimodal report...")

# Six months of synthetic measurements; the same frame feeds the table and the chart.
monthly_data = pd.DataFrame(
    {
        "Month": ["Jan", "Feb", "Mar", "Apr", "May", "Jun"],
        "Query Volume": [1200, 1700, 2100, 2600, 3300, 4100],
        "Hybrid Accuracy": [0.71, 0.74, 0.79, 0.83, 0.87, 0.91],
        "Average Latency ms": [980, 920, 850, 790, 760, 730],
    }
)
table_md = monthly_data.to_markdown(index=False)

plt.figure(figsize=(8, 4.8))
plt.plot(monthly_data["Month"], monthly_data["Query Volume"], marker="o", label="Query Volume")
# Accuracy is a 0-1 fraction; multiplying by 4000 puts it on the volume axis.
plt.plot(monthly_data["Month"], monthly_data["Hybrid Accuracy"] * 4000, marker="s", label="Hybrid Accuracy scaled")
# The annotation below states that latency falls, so the latency series is
# plotted as well; without it the figure would not show what it claims.
plt.plot(monthly_data["Month"], monthly_data["Average Latency ms"], marker="^", label="Average Latency ms")
plt.title("Multimodal RAG Usage and Quality Trend")
plt.xlabel("Month")
plt.ylabel("Volume / Scaled Accuracy / Latency (ms)")
plt.legend()
plt.grid(True, alpha=0.3)
# Annotation positioned in axes coordinates (upper-left corner of the plot area).
plt.text(
    0.02,
    0.95,
    "Synthetic figure: usage rises while latency falls",
    transform=plt.gca().transAxes,
    fontsize=9,
    verticalalignment="top",
    bbox=dict(boxstyle="round", alpha=0.15),
)
chart_path = ASSET_DIR / "raganything_quality_trend.png"
plt.tight_layout()
plt.savefig(chart_path, dpi=180)
plt.close()
# Draw the synthetic report as a PDF on letter-sized pages.
# `y` is the vertical drawing position; it is decremented after each element.
report_pdf_path = ASSET_DIR / "synthetic_multimodal_rag_report.pdf"
c = canvas.Canvas(str(report_pdf_path), pagesize=letter)
width, height = letter
# --- First page: title and introduction ---
c.setFont("Helvetica-Bold", 18)
c.drawString(0.8 * inch, height - 0.8 * inch, "Synthetic Multimodal RAG Evaluation Report")
c.setFont("Helvetica", 10)
intro_lines = [
    "This report evaluates a synthetic multimodal RAG pipeline for enterprise documents.",
    "The knowledge base includes text, tables, equations, and visual evidence.",
    "The central hypothesis is that hybrid retrieval improves answer quality when evidence spans modalities.",
]
y = height - 1.25 * inch
for line in intro_lines:
    c.drawString(0.8 * inch, y, line)
    y -= 0.22 * inch
# --- Table: the markdown rendering is drawn row by row in a monospaced font ---
c.setFont("Helvetica-Bold", 12)
c.drawString(0.8 * inch, y - 0.1 * inch, "Table 1. Monthly system measurements")
y -= 0.4 * inch
c.setFont("Courier", 7.5)
for row in table_md.splitlines():
    # Each row is cut to at most 120 characters before drawing.
    c.drawString(0.8 * inch, y, row[:120])
    y -= 0.17 * inch
# --- Equation, written as plain text ---
c.setFont("Helvetica-Bold", 12)
c.drawString(0.8 * inch, y - 0.15 * inch, "Equation 1. Weighted multimodal score")
y -= 0.45 * inch
c.setFont("Helvetica", 9)
c.drawString(
    0.8 * inch,
    y,
    "Score(q, d) = alpha * Sim_text(q, d) + beta * Sim_graph(q, d) + gamma * Sim_visual(q, d)",
)
y -= 0.5 * inch
# --- Chart image placed below the equation ---
c.drawImage(str(chart_path), 0.8 * inch, y - 2.8 * inch, width=6.5 * inch, height=2.6 * inch)
# End the first page; everything drawn after this goes on the next one.
c.showPage()
# --- Second page: findings as a dash-prefixed list ---
c.setFont("Helvetica-Bold", 16)
c.drawString(0.8 * inch, height - 0.8 * inch, "Interpretation and Findings")
c.setFont("Helvetica", 10)
findings = [
    "Hybrid retrieval combines semantic similarity with graph-based relationship navigation.",
    "The synthetic table shows accuracy improving from 0.71 to 0.91 over six months.",
    "The generated figure shows query volume increasing while latency gradually decreases.",
    "Equation-level retrieval is useful when the question depends on scoring logic rather than plain prose.",
    "A multimodal system should preserve page index, captions, footnotes, and local image paths for traceability.",
]
y = height - 1.25 * inch
for finding in findings:
    c.drawString(0.8 * inch, y, "- " + finding)
    y -= 0.28 * inch
c.save()
print(f"Created chart: {chart_path}")
print(f"Created PDF: {report_pdf_path}")
print("\nSynthetic table:")
display(monthly_data)
RAG-Anything のテスト用に、現実的なコンテンツを含む合成マルチモーダルレポートを作成します。小規模なパフォーマンス表を構築し、チャートを生成して、テキスト、表、数式、図を含む PDF をエクスポートします。この制御されたドキュメントを用いて、システムが異なるコンテンツタイプをどのように処理するかを明確に観察します。
テキスト、表、数式、画像用の RAG-Anything content_list の構築
コードをコピーしました(コピー済み)
別のブラウザを使用してください
print("\n[6/10] Building direct multimodal content_list...")
# One dict per content block. "type" selects the modality and "page_idx"
# records the page the block is attributed to (0 or 1 here).
content_list: List[Dict[str, Any]] = [
    # Introductory prose.
    {
        "type": "text",
        "text": (
            "This synthetic report evaluates a multimodal retrieval augmented generation system. "
            "The system indexes textual explanations, a structured performance table, a scoring equation, "
            "and a trend figure. The main goal is to answer questions whose evidence is distributed across "
            "several document modalities rather than one plain text passage."
        ),
        "page_idx": 0,
    },
    # Table block: the body is the markdown rendering of monthly_data.
    {
        "type": "table",
        "table_body": table_md,
        "table_caption": ["Table 1: Monthly query volume, hybrid accuracy, and average latency."],
        "table_footnote": ["Synthetic measurements created for a Colab tutorial."],
        "page_idx": 0,
    },
    # Equation block: LaTeX source plus a plain-language description.
    {
        "type": "equation",
        "latex": r"Score(q,d)=\alpha \cdot Sim_{text}(q,d)+\beta \cdot Sim_{graph}(q,d)+\gamma \cdot Sim_{visual}(q,d)",
        "text": (
            "Weighted multimodal retrieval score. Alpha controls text similarity, beta controls graph relationship "
            "similarity, and gamma controls visual similarity."
        ),
        "page_idx": 0,
    },
    # Image block: the chart is referenced by its resolved absolute path.
    {
        "type": "image",
        "img_path": str(chart_path.resolve()),
        "image_caption": ["Figure 1: Multimodal RAG usage and quality trend."],
        "image_footnote": ["The line chart is synthetic and generated inside this tutorial."],
        "page_idx": 0,
    },
    # Closing prose with the headline numbers.
    {
        "type": "text",
        "text": (
            "The key finding is that hybrid retrieval is preferred for cross-modal questions. "
            "Local retrieval is useful for entity-specific lookup, global retrieval is useful for broader themes, "
            "and naive retrieval is a simpler baseline. In this report, hybrid accuracy rises from 0.71 in January "
            "to 0.91 in June, while average latency drops from 980 milliseconds to 730 milliseconds."
        ),
        "page_idx": 1,
    },
]
# Save a JSON copy of the list next to the other assets.
content_list_path = ASSET_DIR / "content_list.json"
with open(content_list_path, "w", encoding="utf-8") as f:
    json.dump(content_list, f, indent=2, ensure_ascii=False)
print(f"Saved content list: {content_list_path}")
合成レポートを RAG-Anything の直接コンテンツリスト形式に変換します。テキスト、表、数式、画像は、キャプション、脚注、ページインデックス、画像パスを持つ個別の構造化ブロックとして表現されます。このマルチモーダルコンテンツのリストを JSON 形式で保存することで、ワークフローの透明性と再利用性を確保します。
OpenAI のチャット、ビジョン、埋め込み関数の定義
コードをコピーしました(コピー済み)
別のブラウザを使用してください
print("\n[7/10] Defining clean OpenAI model and embedding functions...")


async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
    """Send one chat-completion request to ``LLM_MODEL`` and return the reply text.

    Args:
        prompt: User prompt; converted with ``str``.
        system_prompt: Optional system message, placed first when truthy.
        history_messages: Optional earlier messages; only dicts that carry both
            ``"role"`` and ``"content"`` are forwarded.
        **kwargs: Only ``temperature``, ``top_p``, ``max_tokens`` and
            ``response_format`` are passed on, and only when not ``None``.

    Returns:
        The content of the first choice, or ``""`` when that content is empty.
    """
    forwarded = ("temperature", "top_p", "max_tokens", "response_format")
    request_options = {name: kwargs[name] for name in forwarded if kwargs.get(name) is not None}

    conversation = [{"role": "system", "content": str(system_prompt)}] if system_prompt else []
    conversation.extend(
        turn
        for turn in (history_messages or [])
        if isinstance(turn, dict) and "role" in turn and "content" in turn
    )
    conversation.append({"role": "user", "content": str(prompt)})

    completion = await openai_client.chat.completions.create(
        model=LLM_MODEL,
        messages=conversation,
        **request_options,
    )
    return completion.choices[0].message.content or ""
def _guess_image_mime(image_data):
    """Return the MIME type implied by the leading characters of base64 image data.

    The prefixes are the base64 encodings of the PNG, JPEG, GIF and RIFF (WebP)
    file signatures. Anything unrecognized falls back to ``image/jpeg``.
    """
    head = image_data if isinstance(image_data, str) else ""
    for prefix, mime in (
        ("iVBORw0KGgo", "image/png"),
        ("/9j/", "image/jpeg"),
        ("R0lGOD", "image/gif"),
        ("UklGR", "image/webp"),
    ):
        if head.startswith(prefix):
            return mime
    return "image/jpeg"


async def vision_model_func(
    prompt,
    system_prompt=None,
    history_messages=None,
    image_data=None,
    messages=None,
    **kwargs,
):
    """Send a chat-completion request to ``VISION_MODEL`` and return the reply text.

    Args:
        prompt: User prompt; converted with ``str``.
        system_prompt: Optional system message, used only when ``messages`` is empty.
        history_messages: Optional earlier messages; only dicts with ``"role"``
            and ``"content"`` are forwarded. Used only when ``messages`` is empty.
        image_data: Optional base64-encoded image attached to the user turn as
            a data URL.
        messages: Optional ready-made message list. When non-empty it is sent
            as-is (minus ``None`` entries) and the other inputs are ignored.
        **kwargs: Only ``temperature``, ``top_p``, ``max_tokens`` and
            ``response_format`` are passed on, and only when not ``None``.

    Returns:
        The content of the first choice, or ``""`` when that content is empty.
    """
    allowed_kwargs = {
        key: kwargs[key]
        for key in ("temperature", "top_p", "max_tokens", "response_format")
        if kwargs.get(key) is not None
    }

    if messages:
        request_messages = [m for m in messages if m is not None]
    else:
        request_messages = []
        if system_prompt:
            request_messages.append({"role": "system", "content": str(system_prompt)})
        for msg in history_messages or []:
            if isinstance(msg, dict) and "role" in msg and "content" in msg:
                request_messages.append(msg)
        if image_data:
            # Label the data URL with the detected type rather than always
            # image/jpeg; the chart produced by this tutorial is a PNG.
            image_url = f"data:{_guess_image_mime(image_data)};base64,{image_data}"
            request_messages.append(
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": str(prompt)},
                        {"type": "image_url", "image_url": {"url": image_url}},
                    ],
                }
            )
        else:
            request_messages.append({"role": "user", "content": str(prompt)})

    response = await openai_client.chat.completions.create(
        model=VISION_MODEL,
        messages=request_messages,
        **allowed_kwargs,
    )
    return response.choices[0].message.content or ""
async def openai_embedding_func(texts, **kwargs):
    """Embed one or more texts with ``EMBEDDING_MODEL``.

    Args:
        texts: A single string or an iterable of items; each item is converted
            with ``str`` before being sent.
        **kwargs: Accepted and ignored.

    Returns:
        A ``float32`` NumPy array with one row per entry of the response data.
    """
    batch = [texts] if isinstance(texts, str) else texts
    payload = [str(entry) for entry in batch]
    result = await openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=payload,
    )
    return np.array([row.embedding for row in result.data], dtype=np.float32)
# Wrap the embedding coroutine in EmbeddingFunc together with its vector width.
# NOTE(review): 8192 is presumably the embedding model's input-token limit - confirm.
embedding_func = EmbeddingFunc(
    embedding_dim=EMBEDDING_DIM,
    max_token_size=8192,
    func=openai_embedding_func,
)
print("Model and embedding functions ready.")
テキスト生成、ビジョン生成、埋め込み作成のために、クリーンな OpenAI 駆動関数を定義します。システムプロンプト、チャット履歴、マルチモーダル画像入力、およびオプションのモデルパラメータを制御された方法で処理します。その後、埋め込み関数を LightRAG の EmbeddingFunc でラップし、RAG-Anything がインデックス作成と検索時にこれを利用できるようにします。
RAG-Anything の初期化とハイブリッド検索の実行
コードをコピーしました(コピー済み)
異なるブラウザを使用してください
print("\n[8/10] Initializing RAG-Anything...")
# Configuration: storage location, parser selection, and the three
# modality-processing switches (image, table, equation) all turned on.
config = RAGAnythingConfig(
    working_dir=str(WORKING_DIR),
    parser=PARSER_FOR_FULL_PARSE,
    parse_method=PARSE_METHOD,
    enable_image_processing=True,
    enable_table_processing=True,
    enable_equation_processing=True,
)
# The RAG object receives the chat, vision and embedding callables defined earlier.
rag = RAGAnything(
    config=config,
    llm_model_func=llm_model_func,
    vision_model_func=vision_model_func,
    embedding_func=embedding_func,
)
async def maybe_await(value):
    """Return ``value``, awaiting it first when it is awaitable.

    Lets a caller treat a method the same way whether it returned a plain
    result or something that still has to be awaited.
    """
    return (await value) if inspect.isawaitable(value) else value
# Call initialize_storages only when the object provides it. Any failure is
# printed and then ignored, so the notebook continues either way.
if hasattr(rag, "initialize_storages"):
    try:
        await maybe_await(rag.initialize_storages())
        print("RAG-Anything storages initialized.")
    except Exception as e:
        print("Storage initialization skipped or already handled:", repr(e))
print(f"Working directory: {WORKING_DIR}")
print("\n[9/10] Inserting multimodal content and running retrieval queries...")
async def insert_demo_content():
await rag.insert_content_list(
content_list=content_list,
file_path=str(report_pdf_path.name),
split_by_character=None,
split_by_character_only=False,
doc_id="synthetic-multimodal-rag-report",
display_stats=True,
)
await insert_demo_content()
print("Insertion complete.")
queries = [
"What is the main purpose of the multimodal RAG report?",
"How did hybrid accuracy and latency change from January to June?",
"Why is hybrid retrieval better than naive retrieval for this report?",
"What does the weighted multimodal score equation mean?",
]
async def safe_aquery(question, mode="hybrid", vlm_enhanced=False):
    """Query ``rag``, retrying without ``vlm_enhanced`` when the first call raises TypeError.

    Args:
        question: Query text passed to ``rag.aquery``.
        mode: Retrieval mode passed through unchanged.
        vlm_enhanced: Flag passed on the first attempt only.

    Returns:
        Whatever ``rag.aquery`` returns.
    """
    try:
        return await rag.aquery(question, mode=mode, vlm_enhanced=vlm_enhanced)
    except TypeError:
        # NOTE(review): any TypeError triggers the retry, including one raised
        # inside the query itself, so such a failure costs a second call.
        return await rag.aquery(question, mode=mode)
async def run_query_suite():
results = []
for mode in ["naive", "local", "global", "hybrid"]:
print("\n" + "=" * 80)
print(f"QUERY MODE: {mode.upper()}")
print("=" * 80)
for q in queries:
print(f"\nQuestion: {q}")
try:
answer = await safe_aquery(q, mode=mode, vlm_enhanced=False)
except Exception as e:
answer = f"Query failed in mode={mode}: {repr(e)}"
print("\nAnswer:")
print(answer)
print("-" * 80)
results.append(
{
"mode": mode,
"question": q,
"answer_preview": str(answer)[:700],
}
)
return pd.DataFrame(results)
query_results_df = await run_query_suite()
print("\nQuery result preview:")
display(query_results_df)
RAG-Anything を初期化し、作業ディレクトリ、パーサー設定、およびマルチモーダル処理オプションを有効にします。準備されたマルチモーダルコンテンツリストをシステムに挿入し、RAG-Anything にテキスト、表、数式、画像ブロックの処理を行わせます。その後、naive、local、global、hybrid の各検索モードで複数のクエリを実行します。
原文を表示
In this tutorial, we build a RAG-Anything workflow and use it to explore how multimodal retrieval works across text, tables, equations, and images. We start by preparing the Colab environment, installing the required packages, and securely entering our OpenAI API key at runtime to keep the notebook practical and safe to run. We then create a synthetic multimodal report, generate a chart and PDF, convert the content into RAG-Anything’s direct content_list format, and insert it into the retrieval system. As we move through the tutorial, we configure clean OpenAI-based chat, vision, and embedding functions, initialize RAG-Anything, and test different retrieval modes such as naive, local, global, and hybrid.
Installing RAG-Anything Dependencies
Copy CodeCopiedUse a different Browser
import os
import re
import sys
import json
import time
import shutil
import hashlib
import asyncio
import inspect
import getpass
import subprocess
import importlib
import importlib.metadata
from pathlib import Path
from typing import List, Dict, Any
def run_shell(cmd, check=True):
    """Echo a shell command, run it through the shell, and return its exit status.

    Args:
        cmd: Command line string handed to the shell unchanged.
        check: When true, a non-zero exit status raises instead of being returned.

    Returns:
        The exit status reported by the command.

    Raises:
        RuntimeError: If ``check`` is true and the command exits non-zero.
    """
    print(f"\n$ {cmd}")
    exit_code = subprocess.run(cmd, shell=True, text=True).returncode
    failed = exit_code != 0
    if failed and check:
        raise RuntimeError(f"Command failed: {cmd}")
    return exit_code
def _forget_pillow_modules():
    """Remove ``PIL`` and every ``PIL.*`` submodule from ``sys.modules``."""
    cached = [name for name in sys.modules if name == "PIL" or name.startswith("PIL.")]
    for name in cached:
        del sys.modules[name]


_rule = "=" * 80
print(_rule)
print("RAG-Anything Advanced Colab Tutorial")
print(_rule)
print("\n[1/10] Installing dependencies...")

# Drop any already-imported Pillow modules before the packages change on disk.
_forget_pillow_modules()

# Each requirement is wrapped in double quotes on the command line.
_packages = (
    "raganything[image,text]",
    "openai>=1.0.0",
    "python-dotenv",
    "reportlab",
    "pandas",
    "matplotlib",
    "tabulate",
)
run_shell("pip -q install -U " + " ".join(f'"{pkg}"' for pkg in _packages))

# Pin Pillow to one exact version with a forced, uncached reinstall.
run_shell('pip -q install --no-cache-dir --force-reinstall "pillow==11.3.0"')

# Purge again so the next import resolves the freshly installed files.
_forget_pillow_modules()
importlib.invalidate_caches()

try:
    print("Pillow version:", importlib.metadata.version("Pillow"))
except Exception as e:
    print("Could not read Pillow version:", repr(e))
print("\n[2/10] Importing libraries...")
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from IPython.display import display
from reportlab.lib.pagesizes import letter
from reportlab.pdfgen import canvas
from reportlab.lib.units import inch
from openai import AsyncOpenAI
from raganything import RAGAnything, RAGAnythingConfig
from lightrag.utils import EmbeddingFunc
print("Imports successful.")
We begin by setting up the complete Colab environment for the RAG-Anything workflow. We install the required libraries, repair the Pillow dependency, and import all the modules needed for plotting, PDF creation, OpenAI access, and RAG-Anything. We also define a reusable shell helper so the setup remains clear and easy to rerun.
Configuring Directories and Runtime Variables
Copy CodeCopiedUse a different Browser
print("\n[3/10] Preparing directories and runtime settings...")

# Use /content when it exists; otherwise work under the current directory.
if Path("/content").exists():
    BASE_DIR = Path("/content/raganything_advanced_tutorial")
else:
    BASE_DIR = Path.cwd() / "raganything_advanced_tutorial"

ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR = (
    BASE_DIR / leaf for leaf in ("assets", "output", "rag_storage", "logs")
)

# Run switches read by the steps below.
RESET_STORAGE = True
RUN_FULL_DOCUMENT_PARSE = False
PARSER_FOR_FULL_PARSE = "mineru"
PARSE_METHOD = "auto"

for folder in (BASE_DIR, ASSET_DIR, OUTPUT_DIR, WORKING_DIR, LOG_DIR):
    folder.mkdir(parents=True, exist_ok=True)

# With RESET_STORAGE on, the storage folder is wiped and recreated empty.
if RESET_STORAGE and WORKING_DIR.exists():
    shutil.rmtree(WORKING_DIR)
WORKING_DIR.mkdir(parents=True, exist_ok=True)

# Runtime settings exported as environment variables (all values are strings).
os.environ.update(
    {
        "LOG_DIR": str(LOG_DIR),
        "SUMMARY_LANGUAGE": "English",
        "ENABLE_LLM_CACHE": "false",
        "ENABLE_LLM_CACHE_FOR_EXTRACT": "false",
        "MAX_ASYNC": "2",
        "CHUNK_SIZE": "900",
        "CHUNK_OVERLAP_SIZE": "120",
        "TIMEOUT": "240",
    }
)

# Remove pre-existing OpenAI / binding variables so only the key entered later is used.
for stale_name in (
    "OPENAI_API_KEY",
    "OPENAI_ORG_ID",
    "OPENAI_ORGANIZATION",
    "OPENAI_PROJECT",
    "OPENAI_DEFAULT_HEADERS",
    "LLM_BINDING_API_KEY",
    "LLM_BINDING_HOST",
):
    os.environ.pop(stale_name, None)

print(f"Base directory: {BASE_DIR}")
print(f"Assets directory: {ASSET_DIR}")
print(f"Storage directory: {WORKING_DIR}")
print("\n[4/10] Entering OpenAI API key securely...")


def clean_api_key(raw_value: str) -> str:
    """Normalize a pasted API key into a bare ASCII token.

    Removes, in order: a ``Bearer`` scheme in any letter case, all whitespace,
    all non-ASCII characters, surrounding quote/backtick characters in any
    mix or nesting, and a leading ``NAME=`` assignment.

    Args:
        raw_value: Text as pasted by the user; ``None`` or empty is accepted.

    Returns:
        The cleaned key, or an empty string when nothing usable remains.
    """
    quote_chars = "'\"`"
    value = str(raw_value or "")
    # The scheme must go before whitespace is removed, because the regex
    # relies on the space that separates "Bearer" from the token.
    value = re.sub(r"\bbearer\s+", "", value, flags=re.IGNORECASE)
    value = re.sub(r"\s+", "", value)
    # Dropping non-ASCII first lets invisible characters sitting outside the
    # quotes disappear before the quotes themselves are stripped.
    value = value.encode("ascii", errors="ignore").decode("ascii")
    # One strip over the whole character set handles mixed wrappers such as `'key'`.
    value = value.strip(quote_chars)
    if "=" in value:
        # Keep only the right-hand side of a pasted NAME=value assignment.
        value = value.split("=", 1)[1].strip(quote_chars)
    return value
# Read the key through a hidden prompt, then normalize the pasted text.
OPENAI_API_KEY_RAW = getpass.getpass("Paste your OpenAI API key here. Input is hidden: ")
OPENAI_API_KEY = clean_api_key(OPENAI_API_KEY_RAW)
if not OPENAI_API_KEY:
    raise ValueError(
        "No API key was captured. Paste the key into the hidden input box and press Enter."
    )
print("Captured key length:", len(OPENAI_API_KEY))
# The preview shows 12 leading and 6 trailing characters. For a value of 18
# characters or fewer that would print the entire secret, so the preview is
# only shown when it covers at most half of the value.
if len(OPENAI_API_KEY) >= 36:
    print("Captured key prefix:", OPENAI_API_KEY[:12] + "...")
    print("Captured key suffix:", "..." + OPENAI_API_KEY[-6:])
else:
    print("Captured value is too short to preview safely; prefix and suffix are hidden.")
LLM_MODEL = "gpt-4o-mini"
VISION_MODEL = "gpt-4o-mini"
EMBEDDING_MODEL = "text-embedding-3-small"
EMBEDDING_DIM = 1536
openai_client = AsyncOpenAI(api_key=OPENAI_API_KEY)
os.environ["LLM_MODEL"] = LLM_MODEL
os.environ["VISION_MODEL"] = VISION_MODEL
os.environ["EMBEDDING_MODEL"] = EMBEDDING_MODEL
os.environ["EMBEDDING_DIM"] = str(EMBEDDING_DIM)
print("Testing OpenAI chat API with the captured key...")
try:
test_response = await openai_client.chat.completions.create(
model=LLM_MODEL,
messages=[{"role": "user", "content": "Reply with exactly: ok"}],
temperature=0,
)
print("Chat API test response:", test_response.choices[0].message.content)
except Exception as e:
raise RuntimeError(
"The key was captured, but OpenAI rejected the request or the account/model access failed. "
"Check billing, project permissions, and make sure this is an OpenAI Platform API key."
) from e
print("\nTesting OpenAI embedding API...")
try:
test_embedding = await openai_client.embeddings.create(
model=EMBEDDING_MODEL,
input=["RAG-Anything embedding test"],
)
print("Embedding vector length:", len(test_embedding.data[0].embedding))
except Exception as e:
raise RuntimeError(
"Chat worked, but embeddings failed. Make sure your API key has permission for embeddings."
) from e
print("OpenAI API key is working.")
print(f"Chat model: {LLM_MODEL}")
print(f"Vision model: {VISION_MODEL}")
print(f"Embedding model: {EMBEDDING_MODEL}")
print(f"Embedding dimension: {EMBEDDING_DIM}")
We prepare the working directories, output folders, logs, and runtime environment variables that RAG-Anything uses during execution. We securely capture the OpenAI API key via a hidden input, clean the pasted value, and verify that both the chat and embedding calls work correctly. We also define the models and embedding dimensions that power the rest of the tutorial.
Generating a Synthetic Multimodal Report
Copy CodeCopiedUse a different Browser
print("\n[5/10] Creating a synthetic multimodal report...")

# Six months of synthetic measurements; the same frame feeds the table and the chart.
monthly_data = pd.DataFrame(
    {
        "Month": ["Jan", "Feb", "Mar", "Apr", "May", "Jun"],
        "Query Volume": [1200, 1700, 2100, 2600, 3300, 4100],
        "Hybrid Accuracy": [0.71, 0.74, 0.79, 0.83, 0.87, 0.91],
        "Average Latency ms": [980, 920, 850, 790, 760, 730],
    }
)
table_md = monthly_data.to_markdown(index=False)

plt.figure(figsize=(8, 4.8))
plt.plot(monthly_data["Month"], monthly_data["Query Volume"], marker="o", label="Query Volume")
# Accuracy is a 0-1 fraction; multiplying by 4000 puts it on the volume axis.
plt.plot(monthly_data["Month"], monthly_data["Hybrid Accuracy"] * 4000, marker="s", label="Hybrid Accuracy scaled")
# The annotation below states that latency falls, so the latency series is
# plotted as well; without it the figure would not show what it claims.
plt.plot(monthly_data["Month"], monthly_data["Average Latency ms"], marker="^", label="Average Latency ms")
plt.title("Multimodal RAG Usage and Quality Trend")
plt.xlabel("Month")
plt.ylabel("Volume / Scaled Accuracy / Latency (ms)")
plt.legend()
plt.grid(True, alpha=0.3)
# Annotation positioned in axes coordinates (upper-left corner of the plot area).
plt.text(
    0.02,
    0.95,
    "Synthetic figure: usage rises while latency falls",
    transform=plt.gca().transAxes,
    fontsize=9,
    verticalalignment="top",
    bbox=dict(boxstyle="round", alpha=0.15),
)
chart_path = ASSET_DIR / "raganything_quality_trend.png"
plt.tight_layout()
plt.savefig(chart_path, dpi=180)
plt.close()
# Draw the synthetic report as a PDF on letter-sized pages.
# `y` is the vertical drawing position; it is decremented after each element.
report_pdf_path = ASSET_DIR / "synthetic_multimodal_rag_report.pdf"
c = canvas.Canvas(str(report_pdf_path), pagesize=letter)
width, height = letter
# --- First page: title and introduction ---
c.setFont("Helvetica-Bold", 18)
c.drawString(0.8 * inch, height - 0.8 * inch, "Synthetic Multimodal RAG Evaluation Report")
c.setFont("Helvetica", 10)
intro_lines = [
    "This report evaluates a synthetic multimodal RAG pipeline for enterprise documents.",
    "The knowledge base includes text, tables, equations, and visual evidence.",
    "The central hypothesis is that hybrid retrieval improves answer quality when evidence spans modalities.",
]
y = height - 1.25 * inch
for line in intro_lines:
    c.drawString(0.8 * inch, y, line)
    y -= 0.22 * inch
# --- Table: the markdown rendering is drawn row by row in a monospaced font ---
c.setFont("Helvetica-Bold", 12)
c.drawString(0.8 * inch, y - 0.1 * inch, "Table 1. Monthly system measurements")
y -= 0.4 * inch
c.setFont("Courier", 7.5)
for row in table_md.splitlines():
    # Each row is cut to at most 120 characters before drawing.
    c.drawString(0.8 * inch, y, row[:120])
    y -= 0.17 * inch
# --- Equation, written as plain text ---
c.setFont("Helvetica-Bold", 12)
c.drawString(0.8 * inch, y - 0.15 * inch, "Equation 1. Weighted multimodal score")
y -= 0.45 * inch
c.setFont("Helvetica", 9)
c.drawString(
    0.8 * inch,
    y,
    "Score(q, d) = alpha * Sim_text(q, d) + beta * Sim_graph(q, d) + gamma * Sim_visual(q, d)",
)
y -= 0.5 * inch
# --- Chart image placed below the equation ---
c.drawImage(str(chart_path), 0.8 * inch, y - 2.8 * inch, width=6.5 * inch, height=2.6 * inch)
# End the first page; everything drawn after this goes on the next one.
c.showPage()
# --- Second page: findings as a dash-prefixed list ---
c.setFont("Helvetica-Bold", 16)
c.drawString(0.8 * inch, height - 0.8 * inch, "Interpretation and Findings")
c.setFont("Helvetica", 10)
findings = [
    "Hybrid retrieval combines semantic similarity with graph-based relationship navigation.",
    "The synthetic table shows accuracy improving from 0.71 to 0.91 over six months.",
    "The generated figure shows query volume increasing while latency gradually decreases.",
    "Equation-level retrieval is useful when the question depends on scoring logic rather than plain prose.",
    "A multimodal system should preserve page index, captions, footnotes, and local image paths for traceability.",
]
y = height - 1.25 * inch
for finding in findings:
    c.drawString(0.8 * inch, y, "- " + finding)
    y -= 0.28 * inch
c.save()
print(f"Created chart: {chart_path}")
print(f"Created PDF: {report_pdf_path}")
print("\nSynthetic table:")
display(monthly_data)
We create a synthetic multimodal report that provides realistic content for testing in RAG-Anything. We build a small performance table, generate a chart, and export a PDF containing text, a table, an equation, and a figure. We use this controlled document to clearly observe how the system handles different content types.
Building the RAG-Anything content_list for Text, Tables, Equations, and Images
Copy CodeCopiedUse a different Browser
print("\n[6/10] Building direct multimodal content_list...")
# One dict per content block. "type" selects the modality and "page_idx"
# records the page the block is attributed to (0 or 1 here).
content_list: List[Dict[str, Any]] = [
    # Introductory prose.
    {
        "type": "text",
        "text": (
            "This synthetic report evaluates a multimodal retrieval augmented generation system. "
            "The system indexes textual explanations, a structured performance table, a scoring equation, "
            "and a trend figure. The main goal is to answer questions whose evidence is distributed across "
            "several document modalities rather than one plain text passage."
        ),
        "page_idx": 0,
    },
    # Table block: the body is the markdown rendering of monthly_data.
    {
        "type": "table",
        "table_body": table_md,
        "table_caption": ["Table 1: Monthly query volume, hybrid accuracy, and average latency."],
        "table_footnote": ["Synthetic measurements created for a Colab tutorial."],
        "page_idx": 0,
    },
    # Equation block: LaTeX source plus a plain-language description.
    {
        "type": "equation",
        "latex": r"Score(q,d)=\alpha \cdot Sim_{text}(q,d)+\beta \cdot Sim_{graph}(q,d)+\gamma \cdot Sim_{visual}(q,d)",
        "text": (
            "Weighted multimodal retrieval score. Alpha controls text similarity, beta controls graph relationship "
            "similarity, and gamma controls visual similarity."
        ),
        "page_idx": 0,
    },
    # Image block: the chart is referenced by its resolved absolute path.
    {
        "type": "image",
        "img_path": str(chart_path.resolve()),
        "image_caption": ["Figure 1: Multimodal RAG usage and quality trend."],
        "image_footnote": ["The line chart is synthetic and generated inside this tutorial."],
        "page_idx": 0,
    },
    # Closing prose with the headline numbers.
    {
        "type": "text",
        "text": (
            "The key finding is that hybrid retrieval is preferred for cross-modal questions. "
            "Local retrieval is useful for entity-specific lookup, global retrieval is useful for broader themes, "
            "and naive retrieval is a simpler baseline. In this report, hybrid accuracy rises from 0.71 in January "
            "to 0.91 in June, while average latency drops from 980 milliseconds to 730 milliseconds."
        ),
        "page_idx": 1,
    },
]
# Save a JSON copy of the list next to the other assets.
content_list_path = ASSET_DIR / "content_list.json"
with open(content_list_path, "w", encoding="utf-8") as f:
    json.dump(content_list, f, indent=2, ensure_ascii=False)
print(f"Saved content list: {content_list_path}")
We convert the synthetic report into RAG-Anything’s direct content_list format. We represent text, tables, equations, and images as separate structured blocks with captions, footnotes, page indexes, and image paths. We save this list of multimodal content as JSON so the workflow remains transparent and reusable.
Defining OpenAI Chat, Vision, and Embedding Functions
Copy CodeCopiedUse a different Browser
print("\n[7/10] Defining clean OpenAI model and embedding functions...")


async def llm_model_func(prompt, system_prompt=None, history_messages=None, **kwargs):
    """Send one chat-completion request to ``LLM_MODEL`` and return the reply text.

    Args:
        prompt: User prompt; converted with ``str``.
        system_prompt: Optional system message, placed first when truthy.
        history_messages: Optional earlier messages; only dicts that carry both
            ``"role"`` and ``"content"`` are forwarded.
        **kwargs: Only ``temperature``, ``top_p``, ``max_tokens`` and
            ``response_format`` are passed on, and only when not ``None``.

    Returns:
        The content of the first choice, or ``""`` when that content is empty.
    """
    forwarded = ("temperature", "top_p", "max_tokens", "response_format")
    request_options = {name: kwargs[name] for name in forwarded if kwargs.get(name) is not None}

    conversation = [{"role": "system", "content": str(system_prompt)}] if system_prompt else []
    conversation.extend(
        turn
        for turn in (history_messages or [])
        if isinstance(turn, dict) and "role" in turn and "content" in turn
    )
    conversation.append({"role": "user", "content": str(prompt)})

    completion = await openai_client.chat.completions.create(
        model=LLM_MODEL,
        messages=conversation,
        **request_options,
    )
    return completion.choices[0].message.content or ""
# Leading base64 characters produced by the magic bytes of common image
# formats (PNG, JPEG, GIF, RIFF/WebP), paired with the MIME type to declare.
_BASE64_IMAGE_SIGNATURES = (
    ("iVBORw0KGgo", "image/png"),
    ("/9j/", "image/jpeg"),
    ("R0lGOD", "image/gif"),
    ("UklGR", "image/webp"),
)


def _to_image_data_url(image_data):
    """Return a ``data:`` URL for base64-encoded image data.

    The MIME type is detected from the base64 signature so that, for example,
    a PNG chart is not mislabelled as JPEG. Input that is already a data URL
    is returned unchanged; unrecognised data keeps the previous
    ``image/jpeg`` label.
    """
    payload = str(image_data)
    if payload.startswith("data:"):
        return payload
    mime_type = next(
        (
            mime
            for signature, mime in _BASE64_IMAGE_SIGNATURES
            if payload.startswith(signature)
        ),
        "image/jpeg",
    )
    return f"data:{mime_type};base64,{payload}"


async def vision_model_func(
    prompt,
    system_prompt=None,
    history_messages=None,
    image_data=None,
    messages=None,
    **kwargs,
):
    """Send a (possibly multimodal) chat completion request and return the reply text.

    Args:
        prompt: User prompt; converted with ``str``. Ignored when ``messages``
            is given.
        system_prompt: Optional system message placed first.
        history_messages: Optional prior messages; only dicts carrying both
            "role" and "content" are kept.
        image_data: Optional base64-encoded image (or a ready-made data URL)
            attached to the user message as an ``image_url`` part.
        messages: Optional pre-built message list. When non-empty it is sent
            as-is (minus ``None`` entries) and the other message arguments
            are ignored.
        **kwargs: Only temperature, top_p, max_tokens and response_format are
            forwarded, and only when not None.

    Returns:
        The content of the first choice, or "" when it is empty or None.
    """
    allowed_kwargs = {
        key: kwargs[key]
        for key in ("temperature", "top_p", "max_tokens", "response_format")
        if kwargs.get(key) is not None
    }

    if messages:
        # Caller supplied a complete conversation; just drop placeholder entries.
        request_messages = [m for m in messages if m is not None]
    else:
        request_messages = []
        if system_prompt:
            request_messages.append({"role": "system", "content": str(system_prompt)})
        request_messages.extend(
            msg
            for msg in history_messages or []
            if isinstance(msg, dict) and "role" in msg and "content" in msg
        )
        if image_data:
            user_content = [
                {"type": "text", "text": str(prompt)},
                {
                    "type": "image_url",
                    "image_url": {"url": _to_image_data_url(image_data)},
                },
            ]
        else:
            user_content = str(prompt)
        request_messages.append({"role": "user", "content": user_content})

    response = await openai_client.chat.completions.create(
        model=VISION_MODEL,
        messages=request_messages,
        **allowed_kwargs,
    )
    return response.choices[0].message.content or ""
async def openai_embedding_func(texts, **kwargs):
    """Embed one string or a collection of items and return a float32 array.

    A bare string is treated as a single-item batch; every item is converted
    with ``str`` before the request. Extra keyword arguments are accepted
    but ignored. The result holds one row per item in the response data, in
    the order the API returned them.
    """
    batch = [texts] if isinstance(texts, str) else texts
    payload = list(map(str, batch))
    result = await openai_client.embeddings.create(
        model=EMBEDDING_MODEL,
        input=payload,
    )
    return np.array([row.embedding for row in result.data], dtype=np.float32)
# Wrap the raw embedding coroutine in EmbeddingFunc, declaring its vector
# dimension and token limit.
# NOTE(review): 8192 is presumably the embedding model's input token limit;
# confirm against the configured EMBEDDING_MODEL.
embedding_func = EmbeddingFunc(
    func=openai_embedding_func,
    embedding_dim=EMBEDDING_DIM,
    max_token_size=8192,
)

print("Model and embedding functions ready.")
We define clean OpenAI-powered functions for text generation, vision generation, and embedding creation. We handle system prompts, chat history, multimodal image inputs, and optional model parameters in a controlled way. We then wrap the embedding function with LightRAG’s EmbeddingFunc so RAG-Anything can use it during indexing and retrieval.
Initializing RAG-Anything and Running Hybrid Retrieval
Copy CodeCopiedUse a different Browser
print("\n[8/10] Initializing RAG-Anything...")

# Image, table and equation processing are all switched on; the parser
# choices come from constants defined earlier in the notebook.
config = RAGAnythingConfig(
    working_dir=str(WORKING_DIR),
    parser=PARSER_FOR_FULL_PARSE,
    parse_method=PARSE_METHOD,
    enable_image_processing=True,
    enable_table_processing=True,
    enable_equation_processing=True,
)

# Hand RAG-Anything the three model callables defined above.
rag = RAGAnything(
    config=config,
    embedding_func=embedding_func,
    llm_model_func=llm_model_func,
    vision_model_func=vision_model_func,
)
async def maybe_await(value):
    """Return ``value``, awaiting it first when it is awaitable.

    Lets callers treat an API uniformly whether it returns a plain result
    or a coroutine/future.
    """
    return (await value) if inspect.isawaitable(value) else value
if hasattr(rag, "initialize_storages"):
try:
await maybe_await(rag.initialize_storages())
print("RAG-Anything storages initialized.")
except Exception as e:
print("Storage initialization skipped or already handled:", repr(e))
print(f"Working directory: {WORKING_DIR}")
print("\n[9/10] Inserting multimodal content and running retrieval queries...")
async def insert_demo_content():
    """Insert the prepared ``content_list`` into RAG-Anything under a fixed doc id.

    The report PDF's file name is passed as ``file_path``, no
    character-based splitting is requested, and statistics display is
    enabled.
    """
    insert_options = {
        "content_list": content_list,
        "file_path": str(report_pdf_path.name),
        "split_by_character": None,
        "split_by_character_only": False,
        "doc_id": "synthetic-multimodal-rag-report",
        "display_stats": True,
    }
    await rag.insert_content_list(**insert_options)
await insert_demo_content()
print("Insertion complete.")
queries = [
"What is the main purpose of the multimodal RAG report?",
"How did hybrid accuracy and latency change from January to June?",
"Why is hybrid retrieval better than naive retrieval for this report?",
"What does the weighted multimodal score equation mean?",
]
async def safe_aquery(question, mode="hybrid", vlm_enhanced=False):
    """Query RAG-Anything, retrying without ``vlm_enhanced`` on TypeError.

    The first attempt passes both ``mode`` and ``vlm_enhanced``. If that
    call raises TypeError, the query is issued once more with ``mode`` only.
    Any other exception, or a failure of the retry, propagates to the caller.
    """
    call_options = {"mode": mode, "vlm_enhanced": vlm_enhanced}
    try:
        return await rag.aquery(question, **call_options)
    except TypeError:
        del call_options["vlm_enhanced"]
        return await rag.aquery(question, **call_options)
async def run_query_suite():
    """Run every question in ``queries`` under each retrieval mode.

    Modes are tried in the order naive, local, global, hybrid. Each answer is
    printed in full; a failing query is recorded as an error string instead
    of aborting the run. Returns a DataFrame with one row per (mode,
    question) holding the first 700 characters of the answer.
    """
    banner = "=" * 80
    rows = []
    for retrieval_mode in ("naive", "local", "global", "hybrid"):
        print("\n" + banner)
        print(f"QUERY MODE: {retrieval_mode.upper()}")
        print(banner)
        for question in queries:
            print(f"\nQuestion: {question}")
            try:
                answer = await safe_aquery(
                    question, mode=retrieval_mode, vlm_enhanced=False
                )
            except Exception as exc:
                # Keep going: one failed query should not end the comparison.
                answer = f"Query failed in mode={retrieval_mode}: {exc!r}"
            print("\nAnswer:")
            print(answer)
            print("-" * 80)
            preview = str(answer)[:700]
            rows.append(
                {"mode": retrieval_mode, "question": question, "answer_preview": preview}
            )
    return pd.DataFrame(rows)
# Top-level await: runs the whole query suite and keeps the resulting DataFrame.
query_results_df = await run_query_suite()
print("\nQuery result preview:")
# NOTE(review): `display` is not defined in this section; presumably the
# IPython/Colab rich-display helper. Confirm it is available.
display(query_results_df)
We initialize RAG-Anything with the working directory, parser settings, and multimodal processing options enabled. We insert the prepared multimodal content list into the system and let RAG-Anything process the text, table, equation, and image blocks. We then run multiple retrieval queries across the naive, local, global, and hybrid modes and collect a preview of each answer in a DataFrame for comparison.
関連記事
今日のまとめ
AI日報で今日の重要ニュースをまとめ読み