Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 3 additions & 18 deletions wiktionary_de_parser/dump_processor/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import bz2
import shutil
import subprocess
from pathlib import Path
Expand Down Expand Up @@ -61,22 +62,6 @@ def download_dump(self):
size = f.write(chunk)
bar.update(size)

@staticmethod
def decompress_dump_file(dump_path: Path) -> subprocess.Popen:
    """Start an external process that streams the decompressed dump.

    Uses ``lbzcat`` when it is found on ``PATH`` and falls back to
    ``bzcat`` otherwise. The decompressed bytes are available on the
    returned process's ``stdout`` pipe.

    Args:
        dump_path: Location of the dump file; its name must end in ".bz2".

    Returns:
        The running ``subprocess.Popen`` object. The caller owns the
        process and its pipe, e.g. by using it as a context manager.

    Raises:
        ValueError: If ``dump_path`` does not end in ".bz2".
    """
    # Guard clause: reject unsupported files before spawning anything.
    if not str(dump_path).endswith(".bz2"):
        raise ValueError("Dump file extension is not .bz2")

    decompress_command = (
        "lbzcat" if shutil.which("lbzcat") is not None else "bzcat"
    )
    # With stdout=PIPE, Popen always provides a readable ``stdout`` stream,
    # so no additional "stdout is None" check is needed here.
    return subprocess.Popen(
        [decompress_command, str(dump_path)], stdout=subprocess.PIPE
    )

@staticmethod
def process_page_data(
page_element, namespaces: dict[None, str], namespace_ids: set[int]
Expand Down Expand Up @@ -132,12 +117,12 @@ def pages(self):
0
} # see https://de.wiktionary.org/wiki/Hilfe:Namensr%C3%A4ume

with self.decompress_dump_file(self.dump_file_path) as p:
with bz2.open(self.dump_file_path) as p:
namespace_str = "http://www.mediawiki.org/xml/export-0.11/"
namespaces = {None: namespace_str}

for _, page_element in etree.iterparse(
p.stdout, tag=f"{{{namespace_str}}}page"
p, tag=f"{{{namespace_str}}}page"
):
page = self.process_page_data(
page_element, namespaces, namespace_ids
Expand Down