diff --git a/.gitignore b/.gitignore index 986be94ec..78dcd11e9 100644 --- a/.gitignore +++ b/.gitignore @@ -52,3 +52,6 @@ next-env.d.ts # git worktrees .worktrees/ + +# CodeGraph semantic index (auto-generated) +/.codegraph/ diff --git a/confluence-mdx/.gitignore b/confluence-mdx/.gitignore index e0fcd0c34..e817fc2a5 100644 --- a/confluence-mdx/.gitignore +++ b/confluence-mdx/.gitignore @@ -8,5 +8,4 @@ /bin/mdx_to_storage/__pycache__/ /tests/__pycache__/ /tests/test_mdx_to_storage/__pycache__/ -/var/list.txt /reports/ diff --git a/confluence-mdx/README.md b/confluence-mdx/README.md index d63df558a..9d1c7c221 100644 --- a/confluence-mdx/README.md +++ b/confluence-mdx/README.md @@ -80,10 +80,10 @@ pip3 install requests beautifulsoup4 pyyaml 1. `confluence-mdx/var/`에 Confluence 문서 데이터를 저장합니다. - 개별 문서마다 `<page_id>/page.xhtml`, `<page_id>/page.v1.yaml` 등을 저장합니다. - - 전체 문서 목록을 `var/pages.yaml`에 저장합니다. + - 전체 문서 목록을 `var/pages.<sync_code>.yaml`에 저장합니다 (예: `var/pages.qm.yaml`). - `fetch_cli.py`를 사용합니다. 2. `src/content/ko/` 아래에 MDX 문서를 생성합니다. - - `var/pages.yaml`을 기반으로 모든 페이지를 변환합니다. + - `var/pages.<sync_code>.yaml`을 기반으로 모든 페이지를 변환합니다. - `convert_all.py`를 사용합니다. 무작정 따라해 보기 @@ -130,8 +130,7 @@ bin/fetch_cli.py --attachments bin/fetch_cli.py --local # 로컬에서 fetch_cli.py 개선 과정에서, 반복실행할 때 사용하는 명령입니다. -# 또는, var/list.txt 를 업데이트하고자 하는 경우에 실행합니다. -bin/fetch_cli.py --local >var/list.txt +bin/fetch_cli.py --local # 특정 페이지 ID와 하위 문서를 내려받습니다. 첨부파일을 포함하여 내려받습니다. # 일부 문서만 변경한 경우, 해당 문서와 하위 페이지를 API 로 내려받아 저장할 때 사용합니다. @@ -156,23 +155,22 @@ bin/fetch_cli.py --log-level DEBUG 실행 결과: - `var/` 디렉토리에 문서 데이터가 저장됩니다. - 각 페이지 ID에 해당하는 디렉토리에 `page.yaml`과 `page.xhtml` 파일이 저장됩니다. -- `>list.txt`로 stdout 을 redirect 하면, `list.txt` 파일에 문서 목록이 저장됩니다. ### 2. 전체 변환 (convert_all.py) -`convert_all.py`는 `var/pages.yaml`을 기반으로 모든 페이지를 MDX로 변환하는 스크립트입니다. +`convert_all.py`는 `var/pages.<sync_code>.yaml`을 기반으로 모든 페이지를 MDX로 변환하는 스크립트입니다. 변환 전에 번역 누락을 자동 검증합니다. 
실행 방법: ```bash -# 전체 변환 (번역 검증 포함) +# 전체 변환 (번역 검증 포함, 기본: --sync-code qm) bin/convert_all.py +# QCP Space 변환 +bin/convert_all.py --sync-code qcp + # 번역 검증만 수행 (변환하지 않음) bin/convert_all.py --verify-translations - -# 디버깅용 list.txt / list.en.txt 생성 (변환도 함께 수행) -bin/convert_all.py --generate-list ``` 실행 결과: diff --git a/confluence-mdx/bin/convert_all.py b/confluence-mdx/bin/convert_all.py index 95a447545..753c436a2 100755 --- a/confluence-mdx/bin/convert_all.py +++ b/confluence-mdx/bin/convert_all.py @@ -6,9 +6,9 @@ 하나의 명령으로 대체합니다. Usage: - bin/convert_all.py # 전체 변환 + bin/convert_all.py # 전체 변환 (기본: --sync-code qm) + bin/convert_all.py --sync-code qcp # QCP Space 변환 bin/convert_all.py --verify-translations # 번역 검증만 수행 - bin/convert_all.py --generate-list # list.txt / list.en.txt 생성 """ import argparse @@ -77,36 +77,8 @@ def verify_translations(pages: List[Dict], translations: Dict[str, str]) -> List return missing -def generate_list_files(pages: List[Dict], output_dir: str) -> None: - """Generate list.txt (Korean) and list.en.txt (English) from pages.yaml.""" - list_txt_lines = [] - list_en_lines = [] - - # Skip the root page (first entry, single breadcrumb) - root_page_id = pages[0]['page_id'] if pages else None - - for page in pages: - if page['page_id'] == root_page_id: - continue - breadcrumbs = page.get('breadcrumbs', []) - breadcrumbs_en = page.get('breadcrumbs_en', []) - list_txt_lines.append(f"{page['page_id']}\t{' />> '.join(breadcrumbs)}\n") - list_en_lines.append(f"{page['page_id']}\t{' />> '.join(breadcrumbs_en)}\n") - - list_txt_path = os.path.join(output_dir, 'list.txt') - list_en_path = os.path.join(output_dir, 'list.en.txt') - - with open(list_txt_path, 'w', encoding='utf-8') as f: - f.writelines(list_txt_lines) - print(f"Generated {list_txt_path} ({len(list_txt_lines)} entries)", file=sys.stderr) - - with open(list_en_path, 'w', encoding='utf-8') as f: - f.writelines(list_en_lines) - print(f"Generated {list_en_path} ({len(list_en_lines)} entries)", 
file=sys.stderr) - - def convert_all(pages: List[Dict], var_dir: str, output_base_dir: str, public_dir: str, - log_level: str) -> int: + log_level: str, pages_yaml: str = '') -> int: """Run converter/cli.py for each page. Returns number of failures.""" # Skip the root page root_page_id = pages[0]['page_id'] if pages else None @@ -148,6 +120,8 @@ def convert_all(pages: List[Dict], var_dir: str, output_base_dir: str, public_di f'--attachment-dir={attachment_dir}', f'--log-level={log_level}', ] + if pages_yaml: + cmd.append(f'--pages-yaml={pages_yaml}') print(f"[{i}/{total}] {page_id} → {output_file}", file=sys.stderr) result = subprocess.run(cmd, capture_output=True, text=True) @@ -162,8 +136,10 @@ def main(): parser = argparse.ArgumentParser( description='Batch convert all Confluence pages to MDX using pages.yaml' ) - parser.add_argument('--pages-yaml', default='var/pages.yaml', - help='Path to pages.yaml (default: var/pages.yaml)') + parser.add_argument('--sync-code', default='qm', + help='Sync profile code; used to auto-derive --pages-yaml (default: %(default)s)') + parser.add_argument('--pages-yaml', default=None, + help='Path to pages YAML (default: var/pages.<sync_code>.yaml)') parser.add_argument('--var-dir', default='var', help='Directory containing page data (default: var)') parser.add_argument('--output-dir', default='target/ko', @@ -174,13 +150,15 @@ def main(): help='Path to translations file') parser.add_argument('--verify-translations', action='store_true', help='Verify translation coverage and exit') - parser.add_argument('--generate-list', action='store_true', - help='Generate list.txt / list.en.txt for debugging') parser.add_argument('--log-level', default='warning', choices=['debug', 'info', 'warning', 'error', 'critical'], help='Log level for converter/cli.py (default: warning)') args = parser.parse_args() + # Auto-derive pages-yaml from sync-code if not explicitly provided + if args.pages_yaml is None: + args.pages_yaml = f'var/pages.{args.sync_code}.yaml' + 
# Resolve relative paths against project root (confluence-mdx/) args.pages_yaml = _resolve(args.pages_yaml) args.var_dir = _resolve(args.var_dir) @@ -208,12 +186,9 @@ def main(): if args.verify_translations: sys.exit(0) - # --generate-list: generate list files - if args.generate_list: - generate_list_files(pages, args.var_dir) - # Run conversions - failures = convert_all(pages, args.var_dir, args.output_dir, args.public_dir, args.log_level) + failures = convert_all(pages, args.var_dir, args.output_dir, args.public_dir, args.log_level, + pages_yaml=args.pages_yaml) if failures: print(f"\nCompleted with {failures} failure(s) out of {len(pages)} pages", file=sys.stderr) diff --git a/confluence-mdx/bin/converter/cli.py b/confluence-mdx/bin/converter/cli.py index 416b98458..5586722b0 100755 --- a/confluence-mdx/bin/converter/cli.py +++ b/confluence-mdx/bin/converter/cli.py @@ -123,6 +123,8 @@ def main(): parser.add_argument('--language', choices=['ko', 'ja', 'en'], help='언어 코드를 명시적으로 지정 (미지정 시 출력 경로에서 자동 감지)') + parser.add_argument('--pages-yaml', + help='pages.<sync_code>.yaml 경로 (미지정 시 input_dir/../pages.qm.yaml → pages.yaml 순으로 탐색)') parser.add_argument('--page-dir', help='page.v1.yaml 등 페이지 데이터 디렉토리 (기본: input 파일의 디렉토리)') parser.add_argument('--log-level', @@ -176,8 +178,15 @@ def main(): # 원본 XHTML 보존 — sidecar mapping에서 사용 xhtml_original = html_content - # Load pages.yaml to get the current page's path - pages_yaml_path = os.path.join(input_dir, '..', 'pages.yaml') + # Load pages YAML for internal link resolution and _meta.ts generation. + # Priority: --pages-yaml arg > pages.qm.yaml (new naming) > pages.yaml (legacy). 
+ var_dir = os.path.join(input_dir, '..') + if args.pages_yaml: + pages_yaml_path = args.pages_yaml + else: + pages_yaml_path = os.path.join(var_dir, 'pages.qm.yaml') + if not os.path.exists(pages_yaml_path): + pages_yaml_path = os.path.join(var_dir, 'pages.yaml') load_pages_yaml(pages_yaml_path, PAGES_BY_TITLE, PAGES_BY_ID) # Load page.v1.yaml: --page-dir 우선, 없으면 input_dir에서 탐색 diff --git a/confluence-mdx/bin/fetch/api_client.py b/confluence-mdx/bin/fetch/api_client.py index a253faacf..51967ca08 100644 --- a/confluence-mdx/bin/fetch/api_client.py +++ b/confluence-mdx/bin/fetch/api_client.py @@ -53,14 +53,28 @@ def get_page_data_v1(self, page_id: str) -> Optional[Dict]: url = f"{self.config.base_url}/rest/api/content/{page_id}?expand=title,ancestors,body.storage,body.view" return self.make_request(url, "V1 API page data") - def get_page_data_v2(self, page_id: str) -> Optional[Dict]: - """Get page data using V2 API""" - url = f"{self.config.base_url}/api/v2/pages/{page_id}?body-format=atlas_doc_format" + def get_page_data_v2(self, page_id: str, content_type: str = "page") -> Optional[Dict]: + """Get page data using V2 API. + + Uses /api/v2/folders/{id} for folder content type, /api/v2/pages/{id} otherwise. + """ + if content_type == "folder": + url = f"{self.config.base_url}/api/v2/folders/{page_id}" + else: + url = f"{self.config.base_url}/api/v2/pages/{page_id}?body-format=atlas_doc_format" return self.make_request(url, "V2 API page data") - def get_child_pages(self, page_id: str) -> Optional[Dict]: - """Get child pages using V2 API""" - url = f"{self.config.base_url}/api/v2/pages/{page_id}/children?type=page&limit=100" + def get_child_pages(self, page_id: str, content_type: str = "page") -> Optional[Dict]: + """Get child pages using V2 API. + + Uses /api/v2/folders/{id}/children for folder content type, + /api/v2/pages/{id}/children for page content type. + The type=page filter is omitted so that folder children are also included. 
+ """ + if content_type == "folder": + url = f"{self.config.base_url}/api/v2/folders/{page_id}/children?limit=100" + else: + url = f"{self.config.base_url}/api/v2/pages/{page_id}/children?limit=100" return self.make_request(url, "V2 API child pages") def get_attachments(self, page_id: str) -> Optional[Dict]: diff --git a/confluence-mdx/bin/fetch/config.py b/confluence-mdx/bin/fetch/config.py index c402f0508..d55beeca1 100644 --- a/confluence-mdx/bin/fetch/config.py +++ b/confluence-mdx/bin/fetch/config.py @@ -15,8 +15,16 @@ class Config: """Centralized configuration management""" base_url: str = "https://querypie.atlassian.net/wiki" space_key: str = "QM" # Confluence space key + sync_code: str = "qm" # Sync profile code (see fetch/sync_profiles.py) days: Optional[int] = None # Number of days to look back (None = auto-detect from .fetch_state.yaml) default_start_page_id: str = "608501837" # Root Page ID of "QueryPie Docs" (for breadcrumbs) + root_content_type: str = "page" + """Confluence content type of the root page ('page' or 'folder'). + + Used by Stage 1 when page.v2.yaml does not yet exist (first run on a clean + environment), so the correct API endpoint is selected from the start. + Populated from SyncProfile.root_content_type in fetch_cli.py. 
+ """ quick_start_page_id: str = "544375784" # QueryPie Overview having less children default_output_dir: str = "var" cache_dir: str = "cache" @@ -26,6 +34,11 @@ class Config: download_attachments: bool = False mode: str = "recent" # Mode: "local", "remote", or "recent" + @property + def pages_yaml_filename(self) -> str: + """Filename for pages YAML, derived from sync_code.""" + return f"pages.{self.sync_code}.yaml" + def __post_init__(self): if self.email is None: self.email = os.environ.get('ATLASSIAN_USERNAME', 'your-email@example.com') diff --git a/confluence-mdx/bin/fetch/processor.py b/confluence-mdx/bin/fetch/processor.py index 7c8a52622..d9145ae96 100644 --- a/confluence-mdx/bin/fetch/processor.py +++ b/confluence-mdx/bin/fetch/processor.py @@ -175,8 +175,7 @@ def run(self) -> None: self.logger.info(f"Created output directory: {self.config.default_output_dir}") # Prepare output file path - output_yaml_path = os.path.join(self.config.default_output_dir, "pages.yaml") - output_list_path = os.path.join(self.config.default_output_dir, "list.txt") + output_yaml_path = os.path.join(self.config.default_output_dir, self.config.pages_yaml_filename) start_page_id = self.config.default_start_page_id @@ -232,9 +231,7 @@ def run(self) -> None: ) # Download each page through all 4 stages and output to stdout - # Store downloaded pages for list.txt self.logger.warning(f"Downloading {len(modified_pages)} recently modified pages") - downloaded_list_lines = [] skipped_count = 0 for entry in modified_pages: page_id = entry["id"] @@ -257,8 +254,6 @@ def run(self) -> None: # Output to stdout during download breadcrumbs_str = " />> ".join(page.breadcrumbs) if page.breadcrumbs else "" print(f"{page.page_id}\t{breadcrumbs_str}") - # Store for list.txt (only downloaded pages) - downloaded_list_lines.append(f"{page.page_id}\t{breadcrumbs_str}\n") except Exception as e: self.logger.error(f"Error downloading page ID {page_id}: {str(e)}") continue @@ -267,38 +262,25 @@ def run(self) -> 
None: self.logger.warning(f"Skipped {skipped_count} pages (already up-to-date)") # After downloading, process like local mode (hierarchical traversal from start_page_id) - # Generate pages.yaml and list.txt with full hierarchical tree (like --local mode) + # Generate pages.yaml with full hierarchical tree (like --local mode) # No stdout output in this phase (like --local mode) self.logger.warning(f"Processing page tree from start page ID {start_page_id} (local mode)") page_count = 0 yaml_entries = [] - list_lines = [] for page in self.fetch_page_tree_recursive(start_page_id, start_page_id, use_local=True): if page: - breadcrumbs_str = " />> ".join(page.breadcrumbs) if page.breadcrumbs else "" - # No stdout output in local mode - # Exclude start_page_id from list.txt (root page is not converted to MDX) - if page.page_id != start_page_id: - list_lines.append(f"{page.page_id}\t{breadcrumbs_str}\n") page_count += 1 yaml_entries.append(page.to_dict()) elif self.config.mode == "local": # --local mode: Process existing local files hierarchically from start_page_id - # No stdout output in local mode self.logger.warning(f"Local mode: Processing page tree from start page ID {start_page_id}") page_count = 0 yaml_entries = [] - list_lines = [] for page in self.fetch_page_tree_recursive(start_page_id, start_page_id, use_local=True): if page: - breadcrumbs_str = " />> ".join(page.breadcrumbs) if page.breadcrumbs else "" - # No stdout output in local mode - # Exclude start_page_id from list.txt (root page is not converted to MDX) - if page.page_id != start_page_id: - list_lines.append(f"{page.page_id}\t{breadcrumbs_str}\n") page_count += 1 yaml_entries.append(page.to_dict()) @@ -308,15 +290,13 @@ def run(self) -> None: self.logger.warning(f"Remote mode: Processing page tree from start page ID {start_page_id} via API") page_count = 0 yaml_entries = [] - list_lines = [] for page in self.fetch_page_tree_recursive(start_page_id, start_page_id, use_local=False): if page: - 
breadcrumbs_str = " />> ".join(page.breadcrumbs) if page.breadcrumbs else "" - # Exclude start_page_id from stdout and list.txt (root page is not converted to MDX) + # Exclude start_page_id from stdout (root page is not converted to MDX) if page.page_id != start_page_id: + breadcrumbs_str = " />> ".join(page.breadcrumbs) if page.breadcrumbs else "" print(f"{page.page_id}\t{breadcrumbs_str}") - list_lines.append(f"{page.page_id}\t{breadcrumbs_str}\n") page_count += 1 yaml_entries.append(page.to_dict()) @@ -348,11 +328,6 @@ def run(self) -> None: self.file_manager.save_yaml(output_yaml_path, yaml_entries) self.logger.info(f"YAML data saved to {output_yaml_path}") - # Save list.txt file - if list_lines: - self.file_manager.save_file(output_list_path, "".join(list_lines)) - self.logger.info(f"List file saved to {output_list_path}") - self.logger.info(f"Completed processing {page_count} pages") except Exception as e: self.logger.error(f"Error in main execution: {str(e)}") diff --git a/confluence-mdx/bin/fetch/stages.py b/confluence-mdx/bin/fetch/stages.py index 55068f384..eb28a7fa9 100644 --- a/confluence-mdx/bin/fetch/stages.py +++ b/confluence-mdx/bin/fetch/stages.py @@ -44,6 +44,20 @@ def process(self, page_id: str) -> None: directory = self.get_page_directory(page_id) self.file_manager.ensure_directory(directory) + # Determine content type for API routing: + # 1. Prefer the type stored in page.v2.yaml (present on re-runs). + # 2. Fall back to config.root_content_type when processing the root + # page on a clean environment (page.v2.yaml does not yet exist). + # 3. Default to "page" for all other pages without cached data. 
+ v2_path = os.path.join(self.get_page_directory(page_id), "page.v2.yaml") + existing_v2 = self.file_manager.load_yaml(v2_path) if os.path.exists(v2_path) else None + if existing_v2: + content_type = existing_v2.get("type", "page") + elif page_id == self.config.default_start_page_id: + content_type = self.config.root_content_type + else: + content_type = "page" + api_operations = [ { 'operation': lambda: self.api_client.get_page_data_v1(page_id), @@ -51,12 +65,12 @@ def process(self, page_id: str) -> None: 'filename': "page.v1.yaml" }, { - 'operation': lambda: self.api_client.get_page_data_v2(page_id), + 'operation': lambda: self.api_client.get_page_data_v2(page_id, content_type), 'description': "V2 API page data", 'filename': "page.v2.yaml" }, { - 'operation': lambda: self.api_client.get_child_pages(page_id), + 'operation': lambda: self.api_client.get_child_pages(page_id, content_type), 'description': "V2 API child pages", 'filename': "children.v2.yaml" }, @@ -291,7 +305,7 @@ def _build_breadcrumbs( filtered_ancestors: List[str] = [] found_start_page = False for ancestor in ancestors: - if ancestor.get("type") == "page": + if ancestor.get("type") in ("page", "folder"): if ancestor["id"] == start_page_id: found_start_page = True continue @@ -304,7 +318,7 @@ def _build_breadcrumbs( else: # Include all ancestors ancestor_titles = [ - clean_text(ancestor["title"]) for ancestor in ancestors if ancestor.get("type") == "page" and "title" in ancestor + clean_text(ancestor["title"]) for ancestor in ancestors if ancestor.get("type") in ("page", "folder") and "title" in ancestor ] path = ancestor_titles + [title] diff --git a/confluence-mdx/bin/fetch/sync_profiles.py b/confluence-mdx/bin/fetch/sync_profiles.py new file mode 100644 index 000000000..a345ffa5f --- /dev/null +++ b/confluence-mdx/bin/fetch/sync_profiles.py @@ -0,0 +1,32 @@ +"""Sync profile definitions for each Confluence Space.""" + +from dataclasses import dataclass + + +@dataclass +class SyncProfile: + 
"""Configuration for a single Confluence Space sync target.""" + code: str + space_key: str + start_page_id: str + root_content_type: str = "page" + """Confluence content type of the root page ('page' or 'folder'). + + Used by Stage 1 when page.v2.yaml does not yet exist (e.g. first run on a + clean environment) so the correct API endpoint is called from the start. + """ + + +SYNC_PROFILES: dict[str, SyncProfile] = { + "qm": SyncProfile( + code="qm", + space_key="QM", + start_page_id="608501837", # QueryPie Docs 루트 + ), + "qcp": SyncProfile( + code="qcp", + space_key="QCP", + start_page_id="887849063", # QCP Space 루트 (https://querypie.atlassian.net/wiki/spaces/QCP/folder/887849063) + root_content_type="folder", # 887849063 is a Confluence folder, not a page + ), +} diff --git a/confluence-mdx/bin/fetch_cli.py b/confluence-mdx/bin/fetch_cli.py index 66d89bfbd..0f103c49e 100755 --- a/confluence-mdx/bin/fetch_cli.py +++ b/confluence-mdx/bin/fetch_cli.py @@ -40,6 +40,7 @@ from fetch.config import Config from fetch.processor import ConfluencePageProcessor +from fetch.sync_profiles import SYNC_PROFILES def main(): @@ -48,12 +49,15 @@ def main(): parser = argparse.ArgumentParser( description="Generate a list of pages from a Confluence space" ) - parser.add_argument("--space-key", default=Config().space_key, - help=f"Confluence space key (default: %(default)s)") + parser.add_argument("--sync-code", default="qm", + choices=list(SYNC_PROFILES.keys()), + help="Sync profile code (default: %(default)s)") + parser.add_argument("--space-key", default=None, + help="Confluence space key (overrides sync profile default)") parser.add_argument("--days", type=int, default=None, help="Number of days to look back for modified pages (default: auto-detect from .fetch_state.yaml, fallback: 21)") - parser.add_argument("--start-page-id", default=Config().default_start_page_id, - help="Root page ID for building breadcrumbs (default: %(default)s)") + parser.add_argument("--start-page-id", 
default=None, + help="Root page ID for building breadcrumbs (overrides sync profile default)") parser.add_argument("--base-url", default=Config().base_url, help="Confluence base URL (default: %(default)s)") parser.add_argument("--email", default=Config().email, help="Confluence email for authentication") parser.add_argument("--api-token", default=Config().api_token, help="Confluence API token for authentication") @@ -88,15 +92,23 @@ def main(): # Determine mode (default to "recent" if not specified) mode = args.mode if args.mode else "recent" + # Load sync profile and resolve space_key / start_page_id / root_content_type + profile = SYNC_PROFILES.get(args.sync_code) + space_key = args.space_key or (profile.space_key if profile else Config().space_key) + start_page_id = args.start_page_id or (profile.start_page_id if profile else Config().default_start_page_id) + root_content_type = profile.root_content_type if profile else "page" + # Create configuration config = Config( base_url=args.base_url, - space_key=args.space_key, + space_key=space_key, + sync_code=args.sync_code, days=args.days, email=args.email, api_token=args.api_token, default_output_dir=args.output_dir, - default_start_page_id=args.start_page_id, + default_start_page_id=start_page_id, + root_content_type=root_content_type, download_attachments=args.attachments, mode=mode ) diff --git a/confluence-mdx/bin/find_mdx_with_text.py b/confluence-mdx/bin/find_mdx_with_text.py index 2d25e45ae..24e1ba21c 100755 --- a/confluence-mdx/bin/find_mdx_with_text.py +++ b/confluence-mdx/bin/find_mdx_with_text.py @@ -24,14 +24,20 @@ import yaml +# Ensure bin/ is on sys.path for fetch package imports +_BIN_DIR = Path(__file__).resolve().parent # confluence-mdx/bin/ +if str(_BIN_DIR) not in sys.path: + sys.path.insert(0, str(_BIN_DIR)) + +from fetch.sync_profiles import SYNC_PROFILES + # Configure logging logging.basicConfig( level=logging.INFO, format='%(levelname)s: %(message)s' ) -# Confluence base URL 
-CONFLUENCE_BASE_URL = "https://querypie.atlassian.net/wiki/spaces/QM/pages" +CONFLUENCE_BASE = "https://querypie.atlassian.net/wiki/spaces" def find_mdx_files_with_text(content_dir: Path, search_text: str) -> List[Path]: @@ -165,17 +171,9 @@ def find_page_by_path(pages_by_path: Dict, mdx_path: List[str]) -> Optional[Dict return None -def generate_confluence_link(page_id: str) -> str: - """ - Generate Confluence document link - - Args: - page_id: Confluence page ID - - Returns: - Confluence URL - """ - return f"{CONFLUENCE_BASE_URL}/{page_id}" +def generate_confluence_link(page_id: str, space_key: str) -> str: + """Generate Confluence document link for the given space.""" + return f"{CONFLUENCE_BASE}/{space_key}/pages/{page_id}" def main(): @@ -194,11 +192,17 @@ def main(): default='src/content/ko', help='Content directory to search (default: src/content/ko)' ) + parser.add_argument( + '--sync-code', + default='qm', + choices=list(SYNC_PROFILES.keys()), + help='Sync profile code; pages.<sync_code>.yaml을 로드합니다 (기본: %(default)s)' + ) parser.add_argument( '--pages-yaml', type=str, - default='var/pages.yaml', - help='Path to pages.yaml file (default: var/pages.yaml)' + default=None, + help='Path to pages YAML file (기본: var/pages.<sync_code>.yaml)' ) parser.add_argument( '--workspace-root', @@ -218,7 +222,13 @@ def main(): # Resolve paths content_dir = workspace_root / args.content_dir - pages_yaml_path = workspace_root / 'confluence-mdx' / args.pages_yaml + confluence_mdx_dir = workspace_root / 'confluence-mdx' + if args.pages_yaml: + pages_yaml_path = confluence_mdx_dir / args.pages_yaml + else: + pages_yaml_path = confluence_mdx_dir / f'var/pages.{args.sync_code}.yaml' + if not pages_yaml_path.exists(): + pages_yaml_path = confluence_mdx_dir / 'var/pages.yaml' logging.info(f"Searching for text: '{args.search_text}'") logging.info(f"Content directory: {content_dir}") @@ -240,6 +250,9 @@ def main(): logging.error("No pages loaded from pages.yaml. 
Cannot generate links.") return 1 + # Derive space_key from sync profile + space_key = SYNC_PROFILES[args.sync_code].space_key + # Find matching pages and generate links results = [] for mdx_file in matching_files: @@ -250,7 +263,7 @@ def main(): page_id = page_info.get('page_id') title = page_info.get('title', 'Unknown') title_orig = page_info.get('title_orig', 'Unknown') - confluence_link = generate_confluence_link(page_id) + confluence_link = generate_confluence_link(page_id, space_key) results.append({ 'mdx_file': mdx_file.relative_to(workspace_root), diff --git a/confluence-mdx/bin/image_status.py b/confluence-mdx/bin/image_status.py index dff5c49c2..2e677b42c 100755 --- a/confluence-mdx/bin/image_status.py +++ b/confluence-mdx/bin/image_status.py @@ -20,6 +20,13 @@ import yaml +# Ensure bin/ is on sys.path for local package imports (fetch.sync_profiles) +_SCRIPT_DIR = Path(__file__).resolve().parent +if str(_SCRIPT_DIR) not in sys.path: + sys.path.insert(0, str(_SCRIPT_DIR)) + +from fetch.sync_profiles import SYNC_PROFILES + def read_build_date(workdir: Path) -> str: """Read image build date from .build-date file.""" @@ -29,12 +36,15 @@ def read_build_date(workdir: Path) -> str: return "unknown" -def read_fetch_state(var_dir: Path) -> dict: - """Find and read fetch_state.yaml.""" - for state_file in var_dir.glob("*/fetch_state.yaml"): +def read_fetch_states(var_dir: Path) -> list[tuple[str, dict]]: + """Find and read all fetch_state.yaml files, returning [(dir_name, state), ...].""" + states = [] + for state_file in sorted(var_dir.glob("*/fetch_state.yaml")): + dir_name = state_file.parent.name with open(state_file) as f: - return yaml.safe_load(f) or {} - return {} + state = yaml.safe_load(f) or {} + states.append((dir_name, state)) + return states def scan_pages(var_dir: Path) -> list[dict]: @@ -85,13 +95,16 @@ def format_report(workdir: Path, var_dir: Path, top_n: int) -> str: build_date = read_build_date(workdir) lines.append(f" Build Date : 
{build_date}") - # Fetch state - state = read_fetch_state(var_dir) - if state: - lines.append(f" Last Modified : {state.get('last_modified_seen', '?')}") - lines.append(f" Last Recent Fetch: {state.get('last_recent_fetch', '?')}") - lines.append(f" Last Full Fetch : {state.get('last_full_fetch', '?')}") - lines.append(f" Pages Fetched : {state.get('pages_fetched', '?')}") + # Fetch state (per space) + fetch_states = read_fetch_states(var_dir) + if fetch_states: + for root_id, state in fetch_states: + code = next((p.code for p in SYNC_PROFILES.values() if p.start_page_id == root_id), root_id) + lines.append(f" Fetch State [{code} / {root_id}]:") + lines.append(f" Last Modified : {state.get('last_modified_seen', '?')}") + lines.append(f" Last Recent Fetch: {state.get('last_recent_fetch', '?')}") + lines.append(f" Last Full Fetch : {state.get('last_full_fetch', '?')}") + lines.append(f" Pages Fetched : {state.get('pages_fetched', '?')}") else: lines.append(" Fetch State : not found") diff --git a/confluence-mdx/bin/mdx_to_storage/link_resolver.py b/confluence-mdx/bin/mdx_to_storage/link_resolver.py index 8c520f501..24b2cf893 100644 --- a/confluence-mdx/bin/mdx_to_storage/link_resolver.py +++ b/confluence-mdx/bin/mdx_to_storage/link_resolver.py @@ -5,12 +5,20 @@ from dataclasses import dataclass, field import posixpath import re +import sys from pathlib import Path from typing import Any, Optional from urllib.parse import unquote import yaml +# Ensure bin/ is on sys.path for fetch package imports +_BIN_DIR = Path(__file__).resolve().parent.parent # confluence-mdx/bin/ +if str(_BIN_DIR) not in sys.path: + sys.path.insert(0, str(_BIN_DIR)) + +from fetch.sync_profiles import SYNC_PROFILES + _EXTERNAL_SCHEME_RE = re.compile(r"^[a-zA-Z][a-zA-Z0-9+.-]*:") @@ -55,7 +63,11 @@ class LinkResolver: def __init__(self, pages: Optional[list[PageEntry] | Path] = None) -> None: if pages is None: - pages = Path(__file__).resolve().parents[2] / "var" / "pages.yaml" + var_dir = 
Path(__file__).resolve().parents[2] / "var" + default_code = next(iter(SYNC_PROFILES), "qm") + pages = var_dir / f"pages.{default_code}.yaml" + if not pages.exists(): + pages = var_dir / "pages.yaml" if isinstance(pages, Path): pages = load_pages_yaml(pages) diff --git a/confluence-mdx/bin/reverse_sync_cli.py b/confluence-mdx/bin/reverse_sync_cli.py index d6234990e..5897c141c 100755 --- a/confluence-mdx/bin/reverse_sync_cli.py +++ b/confluence-mdx/bin/reverse_sync_cli.py @@ -101,26 +101,26 @@ def _get_changed_ko_mdx_files(branch: str) -> List[str]: def _resolve_page_id(ko_mdx_path: str) -> str: - """src/content/ko/...mdx 경로에서 pages.yaml을 이용해 page_id를 유도한다.""" + """src/content/ko/...mdx 경로에서 pages.qm.yaml을 이용해 page_id를 유도한다.""" rel = ko_mdx_path.removeprefix('src/content/ko/').removesuffix('.mdx') path_parts = rel.split('/') - pages_path = _PROJECT_DIR / 'var' / 'pages.yaml' + pages_path = _PROJECT_DIR / 'var' / 'pages.qm.yaml' if not pages_path.exists(): - raise ValueError("var/pages.yaml not found") + raise ValueError("var/pages.qm.yaml not found") pages = yaml.safe_load(pages_path.read_text()) for page in pages: if page.get('path') == path_parts: return page['page_id'] - raise ValueError(f"MDX path '{ko_mdx_path}' not found in var/pages.yaml") + raise ValueError(f"MDX path '{ko_mdx_path}' not found in var/pages.qm.yaml") def _resolve_attachment_dir(page_id: str) -> str: - """page_id에서 pages.yaml의 path를 조회하여 attachment-dir를 반환.""" - pages = yaml.safe_load((_PROJECT_DIR / 'var' / 'pages.yaml').read_text()) + """page_id에서 pages.qm.yaml의 path를 조회하여 attachment-dir를 반환.""" + pages = yaml.safe_load((_PROJECT_DIR / 'var' / 'pages.qm.yaml').read_text()) for page in pages: if page['page_id'] == page_id: return '/' + '/'.join(page['path']) - raise ValueError(f"page_id '{page_id}' not found in var/pages.yaml") + raise ValueError(f"page_id '{page_id}' not found in var/pages.qm.yaml") def _detect_language(descriptor: str) -> str: diff --git 
a/confluence-mdx/bin/unused_attachments.py b/confluence-mdx/bin/unused_attachments.py index 3f4fdaa97..d8751fb58 100755 --- a/confluence-mdx/bin/unused_attachments.py +++ b/confluence-mdx/bin/unused_attachments.py @@ -26,7 +26,14 @@ import yaml # Resolve project root (confluence-mdx/) from bin/unused_attachments.py -_PROJECT_DIR = Path(__file__).resolve().parent.parent # confluence-mdx/ +_BIN_DIR = Path(__file__).resolve().parent # confluence-mdx/bin/ +_PROJECT_DIR = _BIN_DIR.parent # confluence-mdx/ + +# Ensure bin/ is on sys.path for fetch package imports +if str(_BIN_DIR) not in sys.path: + sys.path.insert(0, str(_BIN_DIR)) + +from fetch.sync_profiles import SYNC_PROFILES def normalize_filename(name: str) -> str: @@ -34,9 +41,14 @@ def normalize_filename(name: str) -> str: return unicodedata.normalize('NFC', name) -def load_pages_yaml(var_dir: Path) -> list[dict]: - """var/pages.yaml에서 전체 페이지 목록을 로드합니다.""" - pages_file = var_dir / "pages.yaml" +def load_pages_yaml(var_dir: Path, sync_code: str = "qm") -> list[dict]: + """var/pages.<sync_code>.yaml에서 전체 페이지 목록을 로드합니다. + + pages.<sync_code>.yaml이 없으면 레거시 pages.yaml을 fallback으로 사용합니다. + """ + pages_file = var_dir / f"pages.{sync_code}.yaml" + if not pages_file.exists(): + pages_file = var_dir / "pages.yaml" if not pages_file.exists(): return [] with open(pages_file, encoding='utf-8') as f: @@ -122,7 +134,8 @@ def build_cross_reference_index(references: dict[str, set[str]], def find_unused_attachments(var_dir: Path, page_ids: Optional[list[str]] = None, - logger: Optional[logging.Logger] = None) -> list[dict]: + logger: Optional[logging.Logger] = None, + sync_code: str = "qm") -> list[dict]: """미사용 첨부파일을 검출합니다. 
Returns: @@ -133,7 +146,7 @@ def find_unused_attachments(var_dir: Path, # 전체 페이지 목록 결정 if page_ids is None: - pages = load_pages_yaml(var_dir) + pages = load_pages_yaml(var_dir, sync_code) all_page_ids = [p["page_id"] for p in pages] else: all_page_ids = page_ids @@ -261,7 +274,12 @@ def delete_attachments(unused: list[dict], config, logger: logging.Logger) -> tu def main(): parser = argparse.ArgumentParser( - description="Confluence QM Space 첨부파일 사용 여부 검사 및 삭제" + description="Confluence Space 첨부파일 사용 여부 검사 및 삭제" ) + parser.add_argument( + "--sync-code", default="qm", + choices=list(SYNC_PROFILES.keys()), + help="Sync profile code; pages.<sync_code>.yaml을 로드합니다 (기본: %(default)s)" ) + parser.add_argument( "--var-dir", default=None, @@ -306,11 +324,11 @@ def main(): page_ids = [pid.strip() for pid in args.page_id.split(",")] # 미사용 첨부파일 검출 - unused = find_unused_attachments(var_dir, page_ids, logger, sync_code=args.sync_code) # 전체 첨부파일 수 계산 (보고용) if page_ids is None: - pages = load_pages_yaml(var_dir) + pages = load_pages_yaml(var_dir, args.sync_code) all_page_ids = [p["page_id"] for p in pages] else: all_page_ids = page_ids diff --git a/confluence-mdx/compose.yml b/confluence-mdx/compose.yml index 84c48775d..55d8acc47 100644 --- a/confluence-mdx/compose.yml +++ b/confluence-mdx/compose.yml @@ -49,8 +49,12 @@ services: volumes: # Use translation file from host - ./etc/korean-titles-translations.txt:/workdir/etc/korean-titles-translations.txt - # Mount files in var to host - - ./var/pages.yaml:/workdir/var/pages.yaml + # Mount pages YAML files in var to host (one per Space). + # These files are created/updated by fetch_cli.py and must persist on the + # host so that subsequent runs and host-side tools (convert_all.py, etc.) + # can read the catalog. Add a new line here for each new Space. 
+ - ./var/pages.qm.yaml:/workdir/var/pages.qm.yaml + - ./var/pages.qcp.yaml:/workdir/var/pages.qcp.yaml # Mount output directories to host (matching symlink structure in target/) # target/ko -> ../../src/content/ko - ../src/content/ko:/workdir/target/ko diff --git a/confluence-mdx/scripts/entrypoint.sh b/confluence-mdx/scripts/entrypoint.sh index 9f430744f..f6cadb71d 100755 --- a/confluence-mdx/scripts/entrypoint.sh +++ b/confluence-mdx/scripts/entrypoint.sh @@ -38,14 +38,36 @@ case "${1:-help}" in echo "+ bin/$command $@" exec bin/$command "$@" ;; - full) # Execute full workflow + full) # Execute full workflow for a single Space print_image_info shift - echo "# Starting full workflow..." + # Extract --sync-code value from args (default: qm) + sync_code="qm" + prev_arg="" + for arg in "$@"; do + if [[ "$prev_arg" == "--sync-code" ]]; then + sync_code="$arg" + elif [[ "$arg" == "--sync-code="* ]]; then + sync_code="${arg#--sync-code=}" + fi + prev_arg="$arg" + done + echo "# Starting full workflow (sync-code: $sync_code)..." echo "+ bin/fetch_cli.py $@" bin/fetch_cli.py "$@" - echo "+ bin/convert_all.py" - bin/convert_all.py + echo "+ bin/convert_all.py --sync-code $sync_code" + bin/convert_all.py --sync-code "$sync_code" + ;; + full-all) # Execute full workflow for all Spaces + print_image_info + shift + for CODE in qm qcp; do + echo "# Starting full workflow for Space: $CODE..." + echo "+ bin/fetch_cli.py --sync-code $CODE $@" + bin/fetch_cli.py --sync-code "$CODE" "$@" + echo "+ bin/convert_all.py --sync-code $CODE" + bin/convert_all.py --sync-code "$CODE" + done ;; status) # Show detailed var/ data status report exec bin/image_status.py "${@:2}" @@ -66,7 +88,8 @@ Usage: Commands: fetch_cli.py [args...] - Collect Confluence data convert_all.py [args...] - Convert all pages to MDX - full [fetch args...] - Execute full workflow (fetch + convert) + full [fetch args...] - Execute full workflow for a single Space (default: --sync-code qm) + full-all [fetch args...] 
- Execute full workflow for all Spaces (qm, qcp) sequentially converter/cli.py - Convert a single XHTML to MDX status - Show var/ data freshness report bash - Run interactive shell @@ -74,8 +97,9 @@ Commands: Examples: docker run docker.io/querypie/confluence-mdx:latest full - docker run docker.io/querypie/confluence-mdx:latest full --recent - docker run docker.io/querypie/confluence-mdx:latest convert_all.py + docker run docker.io/querypie/confluence-mdx:latest full --sync-code qm --recent + docker run docker.io/querypie/confluence-mdx:latest full-all + docker run docker.io/querypie/confluence-mdx:latest convert_all.py --sync-code qm docker run docker.io/querypie/confluence-mdx:latest fetch_cli.py --attachments docker run docker.io/querypie/confluence-mdx:latest status docker run -v \$(pwd)/target:/workdir/target docker.io/querypie/confluence-mdx:latest full --local diff --git a/confluence-mdx/tests/run-tests.sh b/confluence-mdx/tests/run-tests.sh index c3b17d84b..4c2f2dd9d 100755 --- a/confluence-mdx/tests/run-tests.sh +++ b/confluence-mdx/tests/run-tests.sh @@ -121,17 +121,25 @@ activate_venv() { source "${VENV_DIR}/bin/activate" } -# Resolve page_id → slug path from pages.yaml +# Resolve page_id → slug path from pages.qm.yaml (falls back to pages.yaml for compatibility) resolve_slug_path() { local page_id="$1" python3 -c " import sys, yaml -pages = yaml.safe_load(open('${VENV_DIR}/../var/pages.yaml')) +from pathlib import Path +var_dir = Path('${VENV_DIR}/../var') +pages_file = var_dir / 'pages.qm.yaml' +if not pages_file.exists(): + pages_file = var_dir / 'pages.yaml' +if not pages_file.exists(): + print(f'ERROR: pages.qm.yaml not found in {var_dir}', file=sys.stderr) + sys.exit(1) +pages = yaml.safe_load(pages_file.open()) for p in pages: if str(p.get('page_id', '')) == sys.argv[1]: print('/' + '/'.join(p['path'])) sys.exit(0) -print(f'ERROR: page_id {sys.argv[1]} not found in pages.yaml', file=sys.stderr) +print(f'ERROR: page_id {sys.argv[1]} not found in 
{pages_file.name}', file=sys.stderr) sys.exit(1) " "${page_id}" } diff --git a/confluence-mdx/tests/test_reverse_sync_cli.py b/confluence-mdx/tests/test_reverse_sync_cli.py index 57f901644..32522ff7b 100644 --- a/confluence-mdx/tests/test_reverse_sync_cli.py +++ b/confluence-mdx/tests/test_reverse_sync_cli.py @@ -232,7 +232,7 @@ def test_extract_ko_mdx_path_invalid(): def test_resolve_page_id(tmp_path, monkeypatch): - """pages.yaml에서 MDX 경로로 page_id를 유도한다.""" + """pages.qm.yaml에서 MDX 경로로 page_id를 유도한다.""" import yaml monkeypatch.chdir(tmp_path) var_dir = tmp_path / "var" @@ -241,22 +241,22 @@ def test_resolve_page_id(tmp_path, monkeypatch): {'page_id': '544112828', 'path': ['user-manual', 'user-agent']}, {'page_id': '123456789', 'path': ['overview']}, ] - (var_dir / 'pages.yaml').write_text(yaml.dump(pages)) + (var_dir / 'pages.qm.yaml').write_text(yaml.dump(pages)) result = _resolve_page_id('src/content/ko/user-manual/user-agent.mdx') assert result == '544112828' def test_resolve_page_id_not_found(tmp_path, monkeypatch): - """pages.yaml에 없는 경로이면 ValueError를 발생시킨다.""" + """pages.qm.yaml에 없는 경로이면 ValueError를 발생시킨다.""" import yaml monkeypatch.chdir(tmp_path) var_dir = tmp_path / "var" var_dir.mkdir() pages = [{'page_id': '111', 'path': ['other']}] - (var_dir / 'pages.yaml').write_text(yaml.dump(pages)) + (var_dir / 'pages.qm.yaml').write_text(yaml.dump(pages)) - with pytest.raises(ValueError, match="not found in var/pages.yaml"): + with pytest.raises(ValueError, match="not found in var/pages.qm.yaml"): _resolve_page_id('src/content/ko/nonexistent/page.mdx') diff --git a/confluence-mdx/tests/test_reverse_sync_e2e.py b/confluence-mdx/tests/test_reverse_sync_e2e.py index ad730b3c9..0c24af4bb 100644 --- a/confluence-mdx/tests/test_reverse_sync_e2e.py +++ b/confluence-mdx/tests/test_reverse_sync_e2e.py @@ -124,10 +124,13 @@ def setup_var_793608206(self, tmp_path, monkeypatch): monkeypatch.chdir(tmp_path) dest = tmp_path / "var" / "793608206" shutil.copytree(VAR_DIR, dest) 
- # pages.yaml도 복사 (converter가 {input_dir}/../pages.yaml 을 참조) - pages_yaml = VAR_DIR.parent / "pages.yaml" - if pages_yaml.exists(): - shutil.copy2(pages_yaml, tmp_path / "var" / "pages.yaml") + # pages.{sync_code}.yaml 복사 (converter가 {input_dir}/../pages.qm.yaml 을 참조) + # pages.qm.yaml 우선, 없으면 레거시 pages.yaml fallback + pages_yaml_src = VAR_DIR.parent / "pages.qm.yaml" + if not pages_yaml_src.exists(): + pages_yaml_src = VAR_DIR.parent / "pages.yaml" + if pages_yaml_src.exists(): + shutil.copy2(pages_yaml_src, tmp_path / "var" / "pages.qm.yaml") # _PROJECT_DIR을 tmp_path로 패치하여 run_verify가 tmp_path/var/ 를 사용하도록 함 import reverse_sync_cli monkeypatch.setattr(reverse_sync_cli, '_PROJECT_DIR', tmp_path) diff --git a/confluence-mdx/var/pages.qcp.yaml b/confluence-mdx/var/pages.qcp.yaml new file mode 100644 index 000000000..e69de29bb diff --git a/confluence-mdx/var/pages.yaml b/confluence-mdx/var/pages.qm.yaml similarity index 100% rename from confluence-mdx/var/pages.yaml rename to confluence-mdx/var/pages.qm.yaml