import19: Add procs and limitmb options to increase performance #1784

Merged: 1 commit, Oct 24, 2024
8 changes: 8 additions & 0 deletions docs/admin/upgrade.rst
@@ -120,6 +120,14 @@
 home pages and subpages will be converted to the "users" directory. The data from
 directory will be converted to the "userprofiles" directory. The "userprofiles" directory
 contains data used internally and should always be protected from any access by ACLs.
 
+If you are importing a large wiki with more than 1000 entries or revisions, the index-building
+part of the import will be time-consuming. You can use the following options to speed up the process::
+
+    --procs <number of processors> --limitmb <memory in MB for each process>
+
+Choose the values according to your available hardware resources. The defaults are 1 process and 256 MB of memory.
+See the `Whoosh Tips for speeding up batch indexing docs <https://whoosh.readthedocs.io/en/latest/batch.html>`_ for details.
+
 Testing
 -------
 Review the logs for error messages. Start the moin server and try the "Index" and "History"
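For example, migrating a large 1.9 wiki on a four-core machine might be invoked as shown below. This is an illustrative sketch, not part of the diff: the --data_dir flag name is assumed from the ImportMoin19 signature, and the values should be chosen to match your hardware::

    moin import19 --data_dir /path/to/moin19/data --procs 4 --limitmb 512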
7 changes: 5 additions & 2 deletions src/moin/cli/_util.py
@@ -35,16 +35,19 @@ def get_backends(backends: Optional[str], all_backends: bool) -> set[Backend]:
     return set()
 
 
-def drop_and_recreate_index(indexer):
+def drop_and_recreate_index(indexer, procs=None, limitmb=None, multisegment=False):
     """Drop index and recreate, rebuild and optimize
     :param indexer: IndexingMiddleware object
+    :param procs: Number of processors the writer will use.
+    :param limitmb: Maximum memory (in megabytes) each index-writer will use for the indexing pool
     """
     indexer.close()
     indexer.destroy()
     logging.debug("Create index")
     indexer.create()
     logging.debug("Rebuild index")
-    indexer.rebuild()
+    # the use of multisegment leads to one index segment per process, the optimize step merges them later
+    indexer.rebuild(procs=procs, limitmb=limitmb, multisegment=multisegment)
     logging.debug("Optimize index")
     indexer.optimize_index()
     indexer.open()
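A minimal sketch of driving the updated helper outside the CLI. It assumes moin's Flask app factory moin.app.create_app and that app.storage is the IndexingMiddleware instance (as the import19 call below suggests); the procs/limitmb values are illustrative:

    from moin.app import create_app
    from moin.cli._util import drop_and_recreate_index

    app = create_app()
    with app.app_context():
        # leaving procs/limitmb as None keeps the old single-process
        # behaviour; _modify_index falls back to 1 process and 256 MB
        drop_and_recreate_index(app.storage, procs=4, limitmb=512, multisegment=True)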
4 changes: 2 additions & 2 deletions src/moin/cli/maint/index.py
@@ -68,13 +68,13 @@ def IndexDestroy(tmp):
 
 
 @cli.command("index-build", help="Build the indexes")
-@click.option("--procs", "-p", required=False, type=int, default=1, help="Number of processors the writer will use.")
+@click.option("--procs", "-p", required=False, type=int, default=None, help="Number of processors the writer will use.")
 @click.option(
     "--limitmb",
     "-l",
     required=False,
     type=int,
-    default=10,
+    default=None,
     help="Maximum memory (in megabytes) each index-writer will use for the indexing pool.",
 )
 @click.option("--tmp", is_flag=True, required=False, default=False, help="use the temporary location.")
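Both defaults move to None so that the real fallbacks (1 process, 256 MB) live in one place, _modify_index; note this raises the effective default limitmb from 10 to 256. An explicit, parallel invocation of the command defined above might look like:

    moin index-build --procs 4 --limitmb 512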
13 changes: 11 additions & 2 deletions src/moin/cli/migration/moin19/import19.py
@@ -142,7 +142,16 @@ def migr_statistics(unknown_macros):
     default=NAMESPACE_DEFAULT,
     help="target namespace, e.g. used for members of a wikifarm.",
 )
-def ImportMoin19(data_dir=None, markup_out=None, namespace=None):
+@click.option("--procs", "-p", required=False, type=int, default=1, help="Number of processors the writer will use.")
+@click.option(
+    "--limitmb",
+    "-l",
+    required=False,
+    type=int,
+    default=256,
+    help="Maximum memory (in megabytes) each index-writer will use for the indexing pool.",
+)
+def ImportMoin19(data_dir=None, markup_out=None, namespace=None, procs=None, limitmb=None):
     """Import content and user data from a moin wiki with version 1.9"""
 
     target_namespace = namespace
@@ -263,7 +272,7 @@ def ImportMoin19(data_dir=None, markup_out=None, namespace=None):
         backend.store(meta, out)
 
     logging.info("PHASE4: Rebuilding the index ...")
-    drop_and_recreate_index(app.storage)
+    drop_and_recreate_index(app.storage, procs=procs, limitmb=limitmb, multisegment=True)
 
     logging.info("Finished conversion!")
     if hasattr(conv_out, "unknown_macro_list"):
31 changes: 26 additions & 5 deletions src/moin/storage/middleware/indexing.py
@@ -638,14 +638,19 @@ def remove_revision(self, revid, async_=True):
             # this is no revision left in this item that could be the new "latest rev", just kill the rev
             writer.delete_document(docnum_remove)
 
-    def _modify_index(self, index, schema, wikiname, revids, mode="add", procs=1, limitmb=256):
+    def _modify_index(self, index, schema, wikiname, revids, mode="add", procs=None, limitmb=None, multisegment=False):
         """
         modify index contents - add, update, delete the indexed documents for all given revids
 
         Note: mode == 'add' is faster but you need to make sure to not create duplicate
         documents in the index.
         """
-        with index.writer(procs=procs, limitmb=limitmb) as writer:
+        if procs is None:
+            procs = 1
+        if limitmb is None:
+            limitmb = 256
+        logging.info(f"Using options procs={procs}, limitmb={limitmb}, multisegment={multisegment}")
+        with index.writer(procs=procs, limitmb=limitmb, multisegment=multisegment) as writer:
             for backend_name, revid in revids:
                 if mode in ["add", "update"]:
                     meta, data = self.backend.retrieve(backend_name, revid)
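For reference, a minimal self-contained sketch of the Whoosh batch-indexing API this wraps, following the Whoosh batch docs linked above; the schema, directory name, and values are invented for the example:

    import os
    from whoosh.fields import ID, TEXT, Schema
    from whoosh.index import create_in

    # toy schema standing in for moin's ALL_REVS/LATEST_REVS schemas
    schema = Schema(revid=ID(stored=True, unique=True), content=TEXT)
    os.makedirs("indexdir", exist_ok=True)
    ix = create_in("indexdir", schema)

    # procs > 1 spawns that many indexing processes, limitmb caps the
    # indexing pool of each one, and multisegment=True lets every process
    # commit its own segment instead of merging at commit time
    with ix.writer(procs=4, limitmb=256, multisegment=True) as writer:
        for i in range(1000):
            writer.add_document(revid=str(i), content=f"revision {i}")

    # merging the per-process segments is deferred to a separate optimize
    # pass, which is what indexer.optimize_index() does after rebuild()
    ix.optimize()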
@@ -680,7 +685,7 @@ def _find_latest_backends_revids(self, index, query=None):
         ]
         return latest_backends_revids
 
-    def rebuild(self, tmp=False, procs=1, limitmb=256):
+    def rebuild(self, tmp=False, procs=None, limitmb=None, multisegment=False):
         """
         Add all items/revisions from the backends of this wiki to the index
         (which is expected to have no items/revisions from this wiki yet).
@@ -694,7 +699,16 @@ def rebuild(self, tmp=False, procs=1, limitmb=256):
         try:
             # build an index of all we have (so we know what we have)
             all_revids = self.backend  # the backend is an iterator over all revids
-            self._modify_index(index, self.schemas[ALL_REVS], self.wikiname, all_revids, "add", procs, limitmb)
+            self._modify_index(
+                index,
+                self.schemas[ALL_REVS],
+                self.wikiname,
+                all_revids,
+                "add",
+                procs=procs,
+                limitmb=limitmb,
+                multisegment=multisegment,
+            )
             latest_backends_revids = self._find_latest_backends_revids(index)
         finally:
             index.close()
@@ -703,7 +717,14 @@ def rebuild(self, tmp=False, procs=1, limitmb=256):
         index = storage.open_index(LATEST_REVS)
         try:
             self._modify_index(
-                index, self.schemas[LATEST_REVS], self.wikiname, latest_backends_revids, "add", procs, limitmb
+                index,
+                self.schemas[LATEST_REVS],
+                self.wikiname,
+                latest_backends_revids,
+                "add",
+                procs=procs,
+                limitmb=limitmb,
+                multisegment=multisegment,
             )
         finally:
             index.close()