diff --git a/.gitignore b/.gitignore index 01cc7985..f433f9b4 100644 --- a/.gitignore +++ b/.gitignore @@ -163,3 +163,5 @@ objdump* TODO experimental_mods search_results +gg.docx +unstructured_reader.py diff --git a/Dockerfile b/Dockerfile index 57646eaa..c54dcc79 100644 --- a/Dockerfile +++ b/Dockerfile @@ -11,7 +11,7 @@ RUN echo '[global]' > /etc/pip.conf && \ echo 'index-url = https://mirrors.aliyun.com/pypi/simple/' >> /etc/pip.conf && \ echo 'trusted-host = mirrors.aliyun.com' >> /etc/pip.conf -# 语音输出功能(以下1,2行更换阿里源,第3,4行安装ffmpeg,都可以删除) +# 语音输出功能(以下1,2行更换阿里源,第3,4行安装ffmpeg,都可以删除) RUN sed -i 's/deb.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ sed -i 's/security.debian.org/mirrors.aliyun.com/g' /etc/apt/sources.list.d/debian.sources && \ apt-get update @@ -34,5 +34,7 @@ RUN uv venv --python=3.12 && uv pip install -r requirements.txt -i https://mirro # # 非必要步骤,用于预热模块(可以删除) RUN python -c 'from check_proxy import warm_up_modules; warm_up_modules()' +ENV CGO_ENABLED=0 + # 启动(必要) CMD ["bash", "-c", "python main.py"] diff --git a/check_proxy.py b/check_proxy.py index d9da0e85..0acebbbc 100644 --- a/check_proxy.py +++ b/check_proxy.py @@ -254,13 +254,20 @@ def try_warm_up_vectordb(): nltk.data.path.append(target) try: # 尝试加载 punkt + logger.info(f'nltk模块预热') nltk.data.find('tokenizers/punkt') + nltk.data.find('tokenizers/punkt_tab') + nltk.data.find('taggers/averaged_perceptron_tagger_eng') logger.info('nltk模块预热完成(读取本地缓存)') except: # 如果找不到,则尝试下载 try: logger.info(f'模块预热: nltk punkt (从 Github 下载部分文件到 {target})') - nltk.download('punkt', download_dir=target) + from shared_utils.nltk_downloader import Downloader + _downloader = Downloader() + _downloader.download('punkt', download_dir=target) + _downloader.download('punkt_tab', download_dir=target) + _downloader.download('averaged_perceptron_tagger_eng', download_dir=target) logger.info('nltk模块预热完成') except Exception: logger.exception('模块预热: nltk punkt 失败,可能需要手动安装 nltk punkt') diff --git 
a/shared_utils/nltk_downloader.py b/shared_utils/nltk_downloader.py new file mode 100644 index 00000000..05e16d77 --- /dev/null +++ b/shared_utils/nltk_downloader.py @@ -0,0 +1,2561 @@ +# Natural Language Toolkit: Corpus & Model Downloader +# +# Copyright (C) 2001-2023 NLTK Project +# Author: Edward Loper +# URL: +# For license information, see LICENSE.TXT + +""" +The NLTK corpus and module downloader. This module defines several +interfaces which can be used to download corpora, models, and other +data packages that can be used with NLTK. + +Downloading Packages +==================== +If called with no arguments, ``download()`` will display an interactive +interface which can be used to download and install new packages. +If Tkinter is available, then a graphical interface will be shown, +otherwise a simple text interface will be provided. + +Individual packages can be downloaded by calling the ``download()`` +function with a single argument, giving the package identifier for the +package that should be downloaded: + + >>> download('treebank') # doctest: +SKIP + [nltk_data] Downloading package 'treebank'... + [nltk_data] Unzipping corpora/treebank.zip. + +NLTK also provides a number of \"package collections\", consisting of +a group of related packages. To download all packages in a +collection, simply call ``download()`` with the collection's +identifier: + + >>> download('all-corpora') # doctest: +SKIP + [nltk_data] Downloading package 'abc'... + [nltk_data] Unzipping corpora/abc.zip. + [nltk_data] Downloading package 'alpino'... + [nltk_data] Unzipping corpora/alpino.zip. + ... + [nltk_data] Downloading package 'words'... + [nltk_data] Unzipping corpora/words.zip. + +Download Directory +================== +By default, packages are installed in either a system-wide directory +(if Python has sufficient access to write to it); or in the current +user's home directory. 
However, the ``download_dir`` argument may be +used to specify a different installation target, if desired. + +See ``Downloader.default_download_dir()`` for a more detailed +description of how the default download directory is chosen. + +NLTK Download Server +==================== +Before downloading any packages, the corpus and module downloader +contacts the NLTK download server, to retrieve an index file +describing the available packages. By default, this index file is +loaded from ``https://public.agent-matrix.com/publish/nltk/index.xml``. +If necessary, it is possible to create a new ``Downloader`` object, +specifying a different URL for the package index file. + +Usage:: + + python nltk/downloader.py [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS + +or:: + + python -m nltk.downloader [-d DATADIR] [-q] [-f] [-k] PACKAGE_IDS +""" +# ---------------------------------------------------------------------- + +""" + + 0 1 2 3 +[label][----][label][----] +[column ][column ] + +Notes +===== +Handling data files.. Some questions: + +* Should the data files be kept zipped or unzipped? I say zipped. + +* Should the data files be kept in svn at all? Advantages: history; + automatic version numbers; 'svn up' could be used rather than the + downloader to update the corpora. Disadvantages: they're big, + which makes working from svn a bit of a pain. And we're planning + to potentially make them much bigger. I don't think we want + people to have to download 400MB corpora just to use nltk from svn. + +* Compromise: keep the data files in trunk/data rather than in + trunk/nltk. That way you can check them out in svn if you want + to; but you don't need to, and you can use the downloader instead. + +* Also: keep models in mind. When we change the code, we'd + potentially like the models to get updated. This could require a + little thought. + +* So.. let's assume we have a trunk/data directory, containing a bunch + of packages. 
The packages should be kept as zip files, because we + really shouldn't be editing them much (well -- we may edit models + more, but they tend to be binary-ish files anyway, where diffs + aren't that helpful). So we'll have trunk/data, with a bunch of + files like abc.zip and treebank.zip and propbank.zip. For each + package we could also have eg treebank.xml and propbank.xml, + describing the contents of the package (name, copyright, license, + etc). Collections would also have .xml files. Finally, we would + pull all these together to form a single index.xml file. Some + directory structure wouldn't hurt. So how about:: + + /trunk/data/ ....................... root of data svn + index.xml ........................ main index file + src/ ............................. python scripts + packages/ ........................ dir for packages + corpora/ ....................... zip & xml files for corpora + grammars/ ...................... zip & xml files for grammars + taggers/ ....................... zip & xml files for taggers + tokenizers/ .................... zip & xml files for tokenizers + etc. + collections/ ..................... xml files for collections + + Where the root (/trunk/data) would contain a makefile; and src/ + would contain a script to update the info.xml file. It could also + contain scripts to rebuild some of the various model files. The + script that builds index.xml should probably check that each zip + file expands entirely into a single subdir, whose name matches the + package's uid. + +Changes I need to make: + - in index: change "size" to "filesize" or "compressed-size" + - in index: add "unzipped-size" + - when checking status: check both compressed & uncompressed size. + uncompressed size is important to make sure we detect a problem + if something got partially unzipped. define new status values + to differentiate stale vs corrupt vs corruptly-uncompressed?? 
class Package:
    """
    A directory entry for a downloadable package.  These entries are
    extracted from the XML index file that is downloaded by
    ``Downloader``.  Each package consists of a single file; but if
    that file is a zip file, then it can be automatically decompressed
    when the package is installed.

    ``size`` and ``unzipped_size`` are converted with ``int()``, so an
    index entry must supply both (leaving them as ``None`` raises
    ``TypeError``).  ``unzip`` is converted with ``bool(int(...))``, so
    it accepts ``'0'``/``'1'`` as well as real booleans.  Any keyword
    argument not named in the signature is stored as an extra instance
    attribute.
    """

    def __init__(
        self,
        id,
        url,
        name=None,
        subdir="",
        size=None,
        unzipped_size=None,
        checksum=None,
        svn_revision=None,
        copyright="Unknown",
        contact="Unknown",
        license="Unknown",
        author="Unknown",
        unzip=True,
        **kw,
    ):
        # A unique identifier for this package.
        self.id = id

        # A string name for this package (falls back to the id).
        self.name = name or id

        # The subdirectory where this package should be installed,
        # e.g. 'corpora' or 'taggers'.
        self.subdir = subdir

        # A URL that can be used to download this package's file.
        self.url = url

        # The filesize (in bytes) of the package file.
        self.size = int(size)

        # The total filesize of the files contained in the package's zipfile.
        self.unzipped_size = int(unzipped_size)

        # The MD-5 checksum of the package file.
        self.checksum = checksum

        # A subversion revision number for this package.
        self.svn_revision = svn_revision

        # Copyright holder for this package.
        self.copyright = copyright

        # Name & email of the person who should be contacted with
        # questions about this package.
        self.contact = contact

        # License information for this package.
        self.license = license

        # Author of this package.
        self.author = author

        # The filename that should be used for this package's file: the
        # subdir joined with the id, using the same extension as the last
        # path component of the url.
        ext = os.path.splitext(url.split("/")[-1])[1]
        self.filename = os.path.join(subdir, id + ext)

        # Whether this package should be unzipped by default ('0' or '1').
        self.unzip = bool(int(unzip))

        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """Build a ``Package`` from an XML element or an XML file path.

        :param xml: an element whose attributes are the constructor
            arguments, or a ``str`` path to an XML file whose root
            element is such an element.
        :return: a new ``Package``.
        """
        if isinstance(xml, str):
            # parse() returns an ElementTree; the attributes live on its root.
            xml = ElementTree.parse(xml).getroot()
        attributes = {key: str(value) for key, value in xml.attrib.items()}
        return Package(**attributes)

    def __lt__(self, other):
        """Order packages by identifier."""
        return self.id < other.id

    def __repr__(self):
        return f"<Package {self.id}>"


class Collection:
    """
    A directory entry for a collection of downloadable packages.
    These entries are extracted from the XML index file that is
    downloaded by ``Downloader``.

    Any keyword argument not named in the signature is stored as an
    extra instance attribute.
    """

    def __init__(self, id, children, name=None, **kw):
        # A unique identifier for this collection.
        self.id = id

        # A string name for this collection (falls back to the id).
        self.name = name or id

        # The ``Collections`` or ``Packages`` directly contained by this
        # collection (``fromxml`` fills this with the items' "ref" values).
        self.children = children

        # The ``Packages`` contained by this collection or any collections
        # it recursively contains; left as None by the constructor.
        self.packages = None

        # Include any other attributes provided by the XML file.
        self.__dict__.update(kw)

    @staticmethod
    def fromxml(xml):
        """Build a ``Collection`` from an XML element or an XML file path.

        :param xml: an element whose attributes are the constructor
            arguments and whose ``item`` sub-elements carry a ``ref``
            attribute per child, or a ``str`` path to an XML file whose
            root element is such an element.
        :return: a new ``Collection`` whose ``children`` are the ``ref``
            values, in document order.
        """
        if isinstance(xml, str):
            # parse() returns an ElementTree; the attributes live on its root.
            xml = ElementTree.parse(xml).getroot()
        attributes = {key: str(value) for key, value in xml.attrib.items()}
        children = [child.get("ref") for child in xml.findall("item")]
        return Collection(children=children, **attributes)

    def __lt__(self, other):
        """Order collections by identifier."""
        return self.id < other.id

    def __repr__(self):
        return f"<Collection {self.id}>"
class ErrorMessage(DownloaderMessage):
    """Data server encountered an error

    ``package`` is stored as given.  ``message`` is stored as given
    too, unless it is an ``Exception`` instance, in which case its
    ``str()`` form is stored instead.
    """

    def __init__(self, package, message):
        self.package = package
        self.message = str(message) if isinstance(message, Exception) else message
def __init__(self, server_index_url=None, download_dir=None):
    """Create a downloader whose index has not been fetched yet.

    :param server_index_url: URL of the data server's index file; when
        omitted (or otherwise falsy) ``self.DEFAULT_URL`` is used.
    :param download_dir: default directory to which packages will be
        downloaded; when ``None`` the value returned by
        ``self.default_download_dir()`` is used.
    """
    # The URL for the data server's index file.
    self._url = server_index_url if server_index_url else self.DEFAULT_URL

    # Dictionary from collection identifier to ``Collection``, and
    # dictionary from package identifier to ``Package``.
    self._collections = {}
    self._packages = {}

    # The default directory to which packages will be downloaded.
    self._download_dir = download_dir

    # The XML index file downloaded from the data server, and the time at
    # which it was downloaded.  If it is more than ``INDEX_TIMEOUT``
    # seconds old, it will be re-downloaded.
    self._index = None
    self._index_timestamp = None

    # Dictionary from package/collection identifier to status string
    # (``INSTALLED``, ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``).
    # Cache is used for packages only, not collections.
    self._status_cache = {}

    # Flag for telling if all packages got successfully downloaded or not.
    self._errors = None

    # Decide where we're going to save things to.
    if self._download_dir is None:
        self._download_dir = self.default_download_dir()
def _info_or_id(self, info_or_id):
    """Resolve an identifier to its index record.

    A ``str`` argument is treated as an identifier and looked up with
    ``self.info()``; any other object is assumed to already be a record
    and is returned unchanged.
    """
    return self.info(info_or_id) if isinstance(info_or_id, str) else info_or_id
def _download_package(self, info, download_dir, force):
    """Download one package and, for zip files, unpack it.

    This is a generator: progress is reported by yielding message
    objects (start/finish markers, ``ProgressMessage`` values from 0
    to 100, ``UpToDateMessage``, ``StaleMessage``, ``ErrorMessage``
    and whatever ``_unzip_iter`` yields).

    :param info: the package record; its ``id``, ``filename``,
        ``subdir``, ``url``, ``size`` and ``unzip`` attributes are used.
    :param download_dir: directory under which the file is stored.
    :param force: when true, download even if the package's status is
        already ``INSTALLED``.

    An ``OSError`` raised while opening the URL or copying its content
    is not propagated; it is reported with an ``ErrorMessage`` and the
    generator stops.  Errors from removing an old file or creating the
    directories are outside that handler and do propagate.
    """
    yield StartPackageMessage(info)
    yield ProgressMessage(0)

    # Do we already have the current version?
    status = self.status(info, download_dir)
    if not force and status == self.INSTALLED:
        yield UpToDateMessage(info)
        yield ProgressMessage(100)
        yield FinishPackageMessage(info)
        return

    # Remove the package from our status cache
    self._status_cache.pop(info.id, None)

    # Check for (and remove) any old/stale version.
    filepath = os.path.join(download_dir, info.filename)
    if os.path.exists(filepath):
        if status == self.STALE:
            yield StaleMessage(info)
        os.remove(filepath)

    # Ensure the download_dir (and the package's subdir) exists
    if not os.path.exists(download_dir):
        os.makedirs(download_dir)
    package_dir = os.path.join(download_dir, info.subdir)
    if not os.path.exists(package_dir):
        os.makedirs(package_dir)

    # Download the file in 16k blocks.
    yield StartDownloadMessage(info)
    yield ProgressMessage(5)
    block_size = 1024 * 16
    try:
        # Both handles are context-managed so the network response is
        # closed on errors and when the consumer abandons this generator
        # part-way through, not only after a complete download.
        with urlopen(info.url) as infile, open(filepath, "wb") as outfile:
            num_blocks = max(1, info.size / block_size)
            for block in itertools.count():
                s = infile.read(block_size)
                if not s:
                    break
                outfile.write(s)
                if block % 2 == 0:  # report every other block
                    yield ProgressMessage(min(80, 5 + 75 * (block / num_blocks)))
    except OSError as e:
        yield ErrorMessage(
            info,
            "Error downloading %r from <%s>:" "\n %s" % (info.id, info.url, e),
        )
        return
    yield FinishDownloadMessage(info)
    yield ProgressMessage(80)

    # If it's a zipfile, uncompress it.
    if info.filename.endswith(".zip"):
        zipdir = os.path.join(download_dir, info.subdir)
        # Unzip if we're unzipping by default; *or* if it's already
        # been unzipped (presumably a previous version).
        if info.unzip or os.path.exists(os.path.join(zipdir, info.id)):
            yield StartUnzipMessage(info)
            for msg in _unzip_iter(filepath, zipdir, verbose=False):
                # Somewhat of a hack, but we need a proper package reference
                msg.package = info
                yield msg
            yield FinishUnzipMessage(info)

    yield FinishPackageMessage(info)
def status(self, info_or_id, download_dir=None):
    """
    Return a constant describing the status of the given package
    or collection.  Status can be one of ``INSTALLED``,
    ``NOT_INSTALLED``, ``STALE``, or ``PARTIAL``.

    :param info_or_id: a package/collection record, or the identifier
        of one (resolved through ``self._info_or_id``).
    :param download_dir: directory to inspect; defaults to
        ``self._download_dir``.

    Package results are cached in ``self._status_cache`` only when the
    inspected directory is ``self._download_dir``; for any other
    directory the status is recomputed on every call.
    """
    if download_dir is None:
        download_dir = self._download_dir
    info = self._info_or_id(info_or_id)

    # Handle collections:
    if isinstance(info, Collection):
        # Inspect the member packages in the directory the caller asked
        # about, so an explicit download_dir is honoured for collections
        # exactly as it is for single packages.
        pkg_status = {self.status(pkg.id, download_dir) for pkg in info.packages}
        if self.STALE in pkg_status:
            return self.STALE
        if self.PARTIAL in pkg_status:
            return self.PARTIAL
        if self.NOT_INSTALLED in pkg_status:
            # A mix of installed and missing packages is "partial".
            if self.INSTALLED in pkg_status:
                return self.PARTIAL
            return self.NOT_INSTALLED
        return self.INSTALLED

    # Handle packages:
    filepath = os.path.join(download_dir, info.filename)
    if download_dir != self._download_dir:
        return self._pkg_status(info, filepath)
    if info.id not in self._status_cache:
        self._status_cache[info.id] = self._pkg_status(info, filepath)
    return self._status_cache[info.id]
def _update_index(self, url=None):
    """A helper function that ensures that self._index is
    up-to-date.  If the index is older than self.INDEX_TIMEOUT,
    then download it again.

    :param url: when given, becomes the new index URL and forces a
        download even if the current index is still fresh.

    After a download, ``self._packages`` and ``self._collections`` are
    rebuilt from the index, each collection's ``children`` identifiers
    are replaced by the records they name, each collection's
    ``packages`` is filled in, and the status cache is cleared.
    """
    # Check if the index is already up-to-date.  If so, do nothing.
    index_is_fresh = (
        self._index is not None
        and url is None
        and time.time() - self._index_timestamp <= self.INDEX_TIMEOUT
    )
    if index_is_fresh:
        return

    # If a URL was specified, then update our URL.
    self._url = url or self._url

    # Download the index file, closing the response once it is parsed.
    with urlopen(self._url) as response:
        root = ElementTree.parse(response).getroot()
    self._index = nltk.internals.ElementWrapper(root)
    self._index_timestamp = time.time()

    # Build a dictionary of packages.
    self._packages = {
        pkg.id: pkg
        for pkg in (
            Package.fromxml(p) for p in self._index.findall("packages/package")
        )
    }

    # Build a dictionary of collections.
    self._collections = {
        col.id: col
        for col in (
            Collection.fromxml(c)
            for c in self._index.findall("collections/collection")
        )
    }

    # Replace identifiers with actual children in collection.children.
    # A fresh list is built instead of deleting entries while iterating:
    # deleting in place shifts the remaining items and makes the loop
    # skip the entry that follows each removed one.
    for collection in self._collections.values():
        resolved = []
        for child_id in collection.children:
            if child_id in self._packages:
                resolved.append(self._packages[child_id])
            elif child_id in self._collections:
                resolved.append(self._collections[child_id])
            else:
                print(
                    "removing collection member with no package: {}".format(
                        child_id
                    )
                )
        collection.children = resolved

    # Fill in collection.packages for each collection.
    for collection in self._collections.values():
        packages = {}
        queue = [collection]
        # Collections already expanded; guards against an index in which
        # collections refer to each other in a cycle.
        expanded = set()
        for child in queue:  # the queue grows while it is being walked
            if isinstance(child, Collection):
                if child.id not in expanded:
                    expanded.add(child.id)
                    queue.extend(child.children)
            elif isinstance(child, Package):
                packages[child.id] = child
        collection.packages = packages.values()

    # Flush the status cache
    self._status_cache.clear()
def _get_download_dir(self):
    """
    The default directory to which packages will be downloaded.
    This defaults to the value returned by ``default_download_dir()``.
    To override this default on a case-by-case basis, use the
    ``download_dir`` argument when calling ``download()``.

    This method is the getter half of the ``download_dir`` property.

    :return: The current value of ``self._download_dir``.
    """
    return self._download_dir
class DownloaderShell:
    """Text-mode front end for a data server object.

    Commands are read with ``input()`` and forwarded to the ``dataserver``
    object handed to the constructor.
    """

    def __init__(self, dataserver):
        """Remember the data server that all commands operate on."""
        self._ds = dataserver

    def _simple_interactive_menu(self, *options):
        """Print ``options`` on one line between two 75-dash rules.

        The gap between neighbouring options is sized from the total
        length of the option texts.
        """
        print("-" * 75)
        gaps = len(options) - 1
        if gaps == 0:
            # A single option has no gap to fill; dividing by the gap
            # count would raise ZeroDivisionError.
            spc = ""
        else:
            spc = (68 - sum(len(o) for o in options)) // gaps * " "
        print(" " + spc.join(options))
        print("-" * 75)

    def run(self):
        """Run the main command loop until the user quits."""
        print("NLTK Downloader")
        while True:
            self._simple_interactive_menu(
                "d) Download",
                "l) List",
                " u) Update",
                "c) Config",
                "h) Help",
                "q) Quit",
            )
            user_input = input("Downloader> ").strip()
            if not user_input:
                print()
                continue
            # The first word is the command, anything after it is passed on.
            command = user_input.lower().split()[0]
            args = user_input.split()[1:]
            try:
                if command == "l":
                    print()
                    self._ds.list(self._ds.download_dir, header=False, more_prompt=True)
                elif command == "h":
                    self._simple_interactive_help()
                elif command == "c":
                    self._simple_interactive_config()
                elif command in ("q", "x"):
                    return
                elif command == "d":
                    self._simple_interactive_download(args)
                elif command == "u":
                    self._simple_interactive_update()
                else:
                    print("Command %r unrecognized" % user_input)
            except HTTPError as e:
                print("Error reading from server: %s" % e)
            except URLError as e:
                print("Error connecting to server: %s" % e.reason)
            # try checking if user_input is a package name, &
            # downloading it?
            print()

    def _download_each(self, identifiers):
        """Download every identifier, printing (not raising) failures."""
        for identifier in identifiers:
            try:
                self._ds.download(identifier, prefix=" ")
            except (OSError, ValueError) as e:
                print(e)

    def _simple_interactive_download(self, args):
        """Download ``args``, or prompt for identifiers when it is empty."""
        if args:
            self._download_each(args)
            return
        while True:
            print()
            print("Download which package (l=list; x=cancel)?")
            user_input = input(" Identifier> ")
            if user_input.lower() == "l":
                self._ds.list(
                    self._ds.download_dir,
                    header=False,
                    more_prompt=True,
                    skip_installed=True,
                )
                continue
            elif user_input.lower() in ("x", "q", ""):
                return
            elif user_input:
                self._download_each(user_input.split())
                break

    def _simple_interactive_update(self):
        """List stale packages and re-download them after confirmation."""
        while True:
            stale_packages = [
                (info.id, info.name)
                for info in sorted(self._ds.packages(), key=str)
                if self._ds.status(info) == self._ds.STALE
            ]

            print()
            if not stale_packages:
                print("Nothing to update.")
                return

            print("Will update following packages (o=ok; x=cancel)")
            for pid, pname in stale_packages:
                # Wrap the name with a 27-column hanging indent, then cut
                # off the dash placeholder that reserved the first line.
                name = textwrap.fill(
                    "-" * 27 + (pname), 75, subsequent_indent=27 * " "
                )[27:]
                print(" [ ] {} {}".format(pid.ljust(20, "."), name))
            print()

            user_input = input(" Identifier> ")
            if user_input.lower() == "o":
                self._download_each(pid for pid, _pname in stale_packages)
                break
            elif user_input.lower() in ("x", "q", ""):
                return
            # Any other answer shows the list again.

    def _simple_interactive_help(self):
        """Print the command summary."""
        print()
        print("Commands:")
        print(
            " d) Download a package or collection u) Update out of date packages"
        )
        print(" l) List packages & collections h) Help")
        print(" c) View & Modify Configuration q) Quit")

    def _show_config(self):
        """Print the server URL, item counts and the data directory."""
        print()
        print("Data Server:")
        print(" - URL: <%s>" % self._ds.url)
        print(" - %d Package Collections Available" % len(self._ds.collections()))
        print(" - %d Individual Packages Available" % len(self._ds.packages()))
        print()
        print("Local Machine:")
        print(" - Data directory: %s" % self._ds.download_dir)

    def _simple_interactive_config(self):
        """Run the configuration sub-menu until the user picks 'm'."""
        self._show_config()
        while True:
            print()
            self._simple_interactive_menu(
                "s) Show Config", "u) Set Server URL", "d) Set Data Dir", "m) Main Menu"
            )
            user_input = input("Config> ").strip().lower()
            if user_input == "s":
                self._show_config()
            elif user_input == "d":
                new_dl_dir = input(" New Directory> ").strip()
                if new_dl_dir in ("", "x", "q", "X", "Q"):
                    print(" Cancelled!")
                elif os.path.isdir(new_dl_dir):
                    self._ds.download_dir = new_dl_dir
                else:
                    print("Directory %r not found! Create it first." % new_dl_dir)
            elif user_input == "u":
                new_url = input(" New URL> ").strip()
                if new_url in ("", "x", "q", "X", "Q"):
                    print(" Cancelled!")
                else:
                    # Default to plain http when no scheme was typed.
                    if not new_url.startswith(("http://", "https://")):
                        new_url = "http://" + new_url
                    try:
                        self._ds.url = new_url
                    except Exception as e:
                        print(f"Error reading <{new_url!r}>:\n {e}")
            elif user_input == "m":
                break
+ Default weight (for columns not explicitly listed) is 1.""" + + COLUMN_WIDTHS = { + "": 1, + "Identifier": 20, + "Name": 45, + "Size": 10, + "Unzipped Size": 10, + "Status": 12, + } + """A dictionary specifying how wide each column should be, in + characters. The default width (for columns not explicitly + listed) is specified by ``DEFAULT_COLUMN_WIDTH``.""" + + DEFAULT_COLUMN_WIDTH = 30 + """The default width for columns that are not explicitly listed + in ``COLUMN_WIDTHS``.""" + + INITIAL_COLUMNS = ["", "Identifier", "Name", "Size", "Status"] + """The set of columns that should be displayed by default.""" + + # Perform a few import-time sanity checks to make sure that the + # column configuration variables are defined consistently: + for c in COLUMN_WEIGHTS: + assert c in COLUMNS + for c in COLUMN_WIDTHS: + assert c in COLUMNS + for c in INITIAL_COLUMNS: + assert c in COLUMNS + + # ///////////////////////////////////////////////////////////////// + # Color Configuration + # ///////////////////////////////////////////////////////////////// + + _BACKDROP_COLOR = ("#000", "#ccc") + + _ROW_COLOR = { + Downloader.INSTALLED: ("#afa", "#080"), + Downloader.PARTIAL: ("#ffa", "#880"), + Downloader.STALE: ("#faa", "#800"), + Downloader.NOT_INSTALLED: ("#fff", "#888"), + } + + _MARK_COLOR = ("#000", "#ccc") + + # _FRONT_TAB_COLOR = ('#ccf', '#008') + # _BACK_TAB_COLOR = ('#88a', '#448') + _FRONT_TAB_COLOR = ("#fff", "#45c") + _BACK_TAB_COLOR = ("#aaa", "#67a") + + _PROGRESS_COLOR = ("#f00", "#aaa") + + _TAB_FONT = "helvetica -16 bold" + + # ///////////////////////////////////////////////////////////////// + # Constructor + # ///////////////////////////////////////////////////////////////// + + def __init__(self, dataserver, use_threads=True): + self._ds = dataserver + self._use_threads = use_threads + + # For the threaded downloader: + self._download_lock = threading.Lock() + self._download_msg_queue = [] + self._download_abort_queue = [] + self._downloading = False + + # 
def _init_widgets(self):
    """Build every widget of the main window and wire up its bindings.

    Creates the frame layout, the tab labels, the package table, the
    URL / download-directory entry boxes, the Download and Refresh
    buttons and the progress bar.  The widgets that other methods need
    are stored on ``self`` (``_tabs``, ``_table``, ``_info``,
    ``_download_button``, ``_refresh_button``, ``_progresslabel``,
    ``_progressbar``).
    """
    # Create the top-level frame structures
    f1 = Frame(self.top, relief="raised", border=2, padx=8, pady=0)
    # The option was previously spelled ``sid``; spell it out so the call
    # does not depend on Tk accepting abbreviated option names.
    f1.pack(side="top", expand=True, fill="both")
    f1.grid_rowconfigure(2, weight=1)
    f1.grid_columnconfigure(0, weight=1)
    Frame(f1, height=8).grid(column=0, row=0)  # spacer
    tabframe = Frame(f1)
    tabframe.grid(column=0, row=1, sticky="news")
    tableframe = Frame(f1)
    tableframe.grid(column=0, row=2, sticky="news")
    buttonframe = Frame(f1)
    buttonframe.grid(column=0, row=3, sticky="news")
    Frame(f1, height=8).grid(column=0, row=4)  # spacer
    infoframe = Frame(f1)
    infoframe.grid(column=0, row=5, sticky="news")
    Frame(f1, height=8).grid(column=0, row=6)  # spacer
    progressframe = Frame(
        self.top, padx=3, pady=3, background=self._BACKDROP_COLOR[1]
    )
    progressframe.pack(side="bottom", fill="x")
    self.top["border"] = 0
    self.top["highlightthickness"] = 0

    # Create the tabs.  Clicking a label selects that tab
    # (``_select_tab`` reads the clicked widget's text).
    self._tab_names = ["Collections", "Corpora", "Models", "All Packages"]
    self._tabs = {}
    for i, tab in enumerate(self._tab_names):
        label = Label(tabframe, text=tab, font=self._TAB_FONT)
        label.pack(side="left", padx=((i + 1) % 2) * 10)
        label.bind("<Button-1>", self._select_tab)
        self._tabs[tab.lower()] = label

    # Create the table.
    column_weights = [self.COLUMN_WEIGHTS.get(column, 1) for column in self.COLUMNS]
    self._table = Table(
        tableframe,
        self.COLUMNS,
        column_weights=column_weights,
        highlightthickness=0,
        listbox_height=16,
        reprfunc=self._table_reprfunc,
    )
    self._table.columnconfig(0, foreground=self._MARK_COLOR[0])  # marked
    for i, column in enumerate(self.COLUMNS):
        width = self.COLUMN_WIDTHS.get(column, self.DEFAULT_COLUMN_WIDTH)
        self._table.columnconfig(i, width=width)
    self._table.pack(expand=True, fill="both")
    self._table.focus()
    # Each handler needs its own event sequence; an empty sequence names
    # no event, so none of these callbacks could ever fire.
    self._table.bind_to_listboxes("<Double-Button-1>", self._download)
    self._table.bind("<space>", self._table_mark)
    self._table.bind("<Return>", self._download)
    self._table.bind("<Left>", self._prev_tab)
    self._table.bind("<Right>", self._next_tab)
    self._table.bind("<Control-a>", self._mark_all)

    # Create entry boxes for URL & download_dir
    infoframe.grid_columnconfigure(1, weight=1)

    info = [
        ("url", "Server Index:", self._set_url),
        ("download_dir", "Download Directory:", self._set_download_dir),
    ]
    self._info = {}
    for i, (key, label, callback) in enumerate(info):
        Label(infoframe, text=label).grid(column=0, row=i, sticky="e")
        entry = Entry(
            infoframe,
            font="courier",
            relief="groove",
            disabledforeground="#007aff",
            foreground="#007aff",
        )
        self._info[key] = (entry, callback)
        # Return commits the edit (``_info_save`` checks for the
        # "Return" keysym); a click on the box starts editing it.
        entry.bind("<Return>", self._info_save)
        entry.bind("<Button-1>", lambda e, key=key: self._info_edit(key))
        entry.grid(column=1, row=i, sticky="ew")

    # If the user edits url or download_dir, and then clicks outside
    # the entry box, then save their results.
    self.top.bind("<Button-1>", self._info_save)

    # Create Download & Refresh buttons.
    self._download_button = Button(
        buttonframe, text="Download", command=self._download, width=8
    )
    self._download_button.pack(side="left")
    self._refresh_button = Button(
        buttonframe, text="Refresh", command=self._refresh, width=8
    )
    self._refresh_button.pack(side="right")

    # Create Progress bar
    self._progresslabel = Label(
        progressframe,
        text="",
        foreground=self._BACKDROP_COLOR[0],
        background=self._BACKDROP_COLOR[1],
    )
    self._progressbar = Canvas(
        progressframe,
        width=200,
        height=16,
        background=self._PROGRESS_COLOR[1],
        relief="sunken",
        border=1,
    )
    self._init_progressbar()
    self._progressbar.pack(side="right")
    self._progresslabel.pack(side="left")
def _select_columns(self):
    """Show or hide each optional table column per its View-menu variable.

    Every entry of ``self._column_vars`` maps a column name to a variable
    object; a truthy ``get()`` shows the column, a falsy one hides it.
    """
    for name, flag in self._column_vars.items():
        if not flag.get():
            self._table.hide_column(name)
            continue
        self._table.show_column(name)
def _table_reprfunc(self, row, col, val):
    """Return the display text for one table cell.

    Cells in a column whose name ends in ``"Size"`` are rendered as KB, MB
    or GB with one decimal place unless the value is already a string.
    Column 0 is rendered with ``str(val)``; every other cell gets a
    leading pad.

    :param row: Row index of the cell (not used for formatting).
    :param col: Column index into ``self._table.column_names``.
    :param val: Raw cell value.
    """
    if self._table.column_names[col].endswith("Size"):
        if isinstance(val, str):
            return " %s" % val
        # (exclusive upper bound, template, divisor), smallest unit first.
        scales = (
            (1024**2, " %.1f KB", 1024.0**1),
            (1024**3, " %.1f MB", 1024.0**2),
        )
        for limit, template, divisor in scales:
            if val < limit:
                return template % (val / divisor)
        return " %.1f GB" % (val / 1024.0**3)

    return str(val) if col in (0, "") else " %s" % val
def _next_tab(self, *e):
    """Switch to the tab after the current one and refill the table.

    Does nothing when the current tab is already the last one.  Server
    errors raised while refilling the table are reported with
    ``showerror`` instead of propagating.

    :param e: Ignored positional arguments (the method is bound as an
        event callback).
    :return: The result of ``self._fill_table()`` on success, otherwise
        ``None``.
    """
    # Match against the tab we started from: ``self._tab`` is reassigned
    # below, and comparing with the live value would let the scan hit the
    # freshly selected tab and advance again.
    current = self._tab
    last_index = len(self._tabs) - 1
    for i, tab in enumerate(self._tab_names):
        if tab.lower() != current or i >= last_index:
            continue
        self._tab = self._tab_names[i + 1].lower()
        try:
            return self._fill_table()
        except HTTPError as err:
            showerror("Error reading from server", err)
        except URLError as err:
            showerror("Error connecting to server", err.reason)
        # Move by exactly one tab even when the refill failed; carrying on
        # with the loop would cascade through the remaining tabs and raise
        # one error dialog per tab.
        return None
    return None
def _download(self, *e):
    """Start downloading the marked rows, or the selected row if none.

    With ``self._use_threads`` set, the call is handed to
    ``self._download_threaded`` unchanged.  Otherwise an incremental
    download iterator is created and driven through ``_download_cb``.

    :param e: Positional arguments from the caller, passed through to the
        threaded downloader.
    """
    # Threaded mode has its own entry point.
    if self._use_threads:
        return self._download_threaded(*e)

    # Collect the identifiers of all rows whose mark column is non-empty.
    targets = []
    for row in range(len(self._table)):
        if self._table[row, 0] != "":
            targets.append(self._table[row, "Identifier"])

    # Fall back to the highlighted row when nothing is marked.
    current = self._table.selected_row()
    if current is not None and not targets:
        targets = [self._table[current, "Identifier"]]

    stream = self._ds.incr_download(targets, self._ds.download_dir)
    self._log_indent = 0
    self._download_cb(stream, targets)
def _table_mark(self, *e):
    """Toggle the mark on the selected row, then move the selection down.

    A marked row carries ``"X"`` in column 0 and an unmarked row carries
    the empty string.  When no row is selected, only the selection is
    advanced.

    :param e: Ignored positional arguments (the method is bound as an
        event callback).
    """
    selection = self._table.selected_row()
    # ``selected_row()`` may return None (``_download`` guards against it
    # the same way); comparing None with 0 would raise TypeError.
    if selection is not None and selection >= 0:
        if self._table[selection][0] != "":
            self._table[selection, 0] = ""
        else:
            self._table[selection, 0] = "X"
    self._table.select(delta=1)
def destroy(self, *e):
    """Destroy the top-level window once; later calls do nothing.

    :param e: Ignored positional arguments (the method is also used as an
        event and menu callback).
    """
    if not self._destroyed:
        self.top.destroy()
        # Recorded only after the window is gone, so a failed destroy can
        # be retried.
        self._destroyed = True
def _show_progress(self, percent):
    """Resize the progress rectangle to ``percent``, or reset the bar.

    :param percent: Progress value used to scale the ``"redbox"`` canvas
        item against the canvas width, or ``None`` to collapse the box
        and hide the ``"gradient"`` items.
    """
    canvas = self._progressbar
    if percent is None:
        # Reset: zero-size box and no gradient.
        canvas.coords("redbox", 0, 0, 0, 0)
        canvas.itemconfig("gradient", state="hidden")
        return

    bar_width = int(canvas["width"])
    bar_height = int(canvas["height"])
    right_edge = percent * int(bar_width) // 100 + 1
    canvas.coords("redbox", 0, 0, right_edge, bar_height + 1)
c.bbox("gradient") + if x1 <= -100: + c.move("gradient", (self._gradient_width * 6) - 4, 0) + else: + c.move("gradient", -4, 0) + afterid = self.top.after(200, self._progress_alive) + self._afterid["_progress_alive"] = afterid + + # ///////////////////////////////////////////////////////////////// + # Threaded downloader + # ///////////////////////////////////////////////////////////////// + + def _download_threaded(self, *e): + # If the user tries to start a new download while we're already + # downloading something, then abort the current download instead. + if self._downloading: + self._abort_download() + return + + # Change the 'download' button to an 'abort' button. + self._download_button["text"] = "Cancel" + + marked = [ + self._table[row, "Identifier"] + for row in range(len(self._table)) + if self._table[row, 0] != "" + ] + selection = self._table.selected_row() + if not marked and selection is not None: + marked = [self._table[selection, "Identifier"]] + + # Create a new data server object for the download operation, + # just in case the user modifies our data server during the + # download (e.g., clicking 'refresh' or editing the index url). + ds = Downloader(self._ds.url, self._ds.download_dir) + + # Start downloading in a separate thread. + assert self._download_msg_queue == [] + assert self._download_abort_queue == [] + self._DownloadThread( + ds, + marked, + self._download_lock, + self._download_msg_queue, + self._download_abort_queue, + ).start() + + # Monitor the download message queue & display its progress. + self._log_indent = 0 + self._downloading = True + self._monitor_message_queue() + + # Display an indication that we're still alive and well by + # cycling the progress bar. 
def _monitor_message_queue(self):
    """
    Process the messages queued by the download thread and update the
    GUI accordingly, then schedule the next poll.

    The queue is only read while ``self._download_lock`` is held.  The
    lock is requested without blocking so the GUI thread never stalls:
    if it is busy, the poll is simply rescheduled.  The lock is
    released on *every* exit path (including the error path and any
    exception raised by a handler), so the download thread can never
    be left waiting on a lock that nobody will release.

    Polling stops when a "finished" or "aborted" marker is seen, or
    when an ``ErrorMessage`` is received.  In all three cases the
    download button is reset to "Download" and the message queue is
    emptied, which is the state ``_download_threaded`` asserts before
    starting a new download.
    """

    def show(s):
        self._progresslabel["text"] = s
        self._log(s)

    def poll_later():
        afterid = self.top.after(
            self._MONITOR_QUEUE_DELAY, self._monitor_message_queue
        )
        self._afterid["_monitor_message_queue"] = afterid

    # Try to acquire the lock; if it's busy, then just try again later.
    # (A blocking acquire() would always succeed and could stall the
    # GUI thread, so the request must be non-blocking.)
    if not self._download_lock.acquire(False):
        poll_later()
        return

    try:
        for msg in self._download_msg_queue:

            # Done downloading?
            if msg == "finished" or msg == "aborted":
                self._update_table_status()
                self._downloading = False
                self._download_button["text"] = "Download"
                del self._download_msg_queue[:]
                del self._download_abort_queue[:]
                if msg == "aborted":
                    show("Download aborted!")
                    self._show_progress(None)
                else:
                    # Leave the full bar visible briefly before clearing.
                    afterid = self.top.after(100, self._show_progress, None)
                    self._afterid["_monitor_message_queue"] = afterid
                return

            # All other messages
            elif isinstance(msg, ProgressMessage):
                self._show_progress(msg.progress)
            elif isinstance(msg, ErrorMessage):
                show(msg.message)
                if msg.package is not None:
                    self._select(msg.package.id)
                self._show_progress(None)
                self._downloading = False
                # Return the GUI to its idle state; the remaining
                # queued messages are discarded.
                # NOTE(review): the worker thread is not stopped here
                # and may still post messages afterwards -- confirm
                # whether it should also be asked to abort.
                self._download_button["text"] = "Download"
                del self._download_msg_queue[:]
                return  # halt progress.
            elif isinstance(msg, StartCollectionMessage):
                show("Downloading collection %r" % msg.collection.id)
                self._log_indent += 1
            elif isinstance(msg, StartPackageMessage):
                self._ds.clear_status_cache(msg.package.id)
                show("Downloading package %r" % msg.package.id)
            elif isinstance(msg, UpToDateMessage):
                show("Package %s is up-to-date!" % msg.package.id)
            elif isinstance(msg, FinishDownloadMessage):
                show("Finished downloading %r." % msg.package.id)
            elif isinstance(msg, StartUnzipMessage):
                show("Unzipping %s" % msg.package.filename)
            elif isinstance(msg, FinishUnzipMessage):
                show("Finished installing %s" % msg.package.id)
            elif isinstance(msg, FinishCollectionMessage):
                self._log_indent -= 1
                show("Finished downloading collection %r." % msg.collection.id)
                self._clear_mark(msg.collection.id)
            elif isinstance(msg, FinishPackageMessage):
                self._update_table_status()
                self._clear_mark(msg.package.id)

        # Let the user know when we're aborting a download (but
        # waiting for a good point to abort it, so we don't end up
        # with a partially unzipped package or anything like that).
        if self._download_abort_queue:
            self._progresslabel["text"] = "Aborting download..."

        # Everything queued so far has been handled.
        del self._download_msg_queue[:]
    finally:
        self._download_lock.release()

    # Check the queue again after MONITOR_QUEUE_DELAY msec.
    poll_later()
def _unzip_iter(filename, root, verbose=True):
    """
    Extract the contents of the zip file ``filename`` into the
    directory ``root``, reporting open failures as messages.

    This is a generator: it yields a single ``ErrorMessage`` (and
    extracts nothing) if the archive cannot be opened, and yields
    nothing on success.  Errors raised while extracting are not
    converted to messages; they propagate to the caller.

    :param filename: Path of the zip archive to extract.
    :param root: Directory the archive members are extracted into.
    :param verbose: If true, write "Unzipping <name>" to stdout before
        extracting and a newline after a successful extraction.
    """
    if verbose:
        sys.stdout.write("Unzipping %s" % os.path.split(filename)[1])
        sys.stdout.flush()

    try:
        zf = zipfile.ZipFile(filename)
    except zipfile.BadZipFile:
        yield ErrorMessage(filename, "Error with downloaded zip file")
        return
    except Exception as e:
        yield ErrorMessage(filename, e)
        return

    # Use the archive as a context manager so its file handle is closed
    # even when extraction fails part-way through.
    with zf:
        zf.extractall(root)

    if verbose:
        print()
zip & xml files for taggers + tokenizers/ .............. zip & xml files for tokenizers + etc. + collections/ ............... xml files for collections + + For each package, there should be two files: ``package.zip`` + (where *package* is the package name) + which contains the package itself as a compressed zip file; and + ``package.xml``, which is an xml description of the package. The + zipfile ``package.zip`` should expand to a single subdirectory + named ``package/``. The base filename ``package`` must match + the identifier given in the package's xml file. + + For each collection, there should be a single file ``collection.zip`` + describing the collection, where *collection* is the name of the collection. + + All identifiers (for both packages and collections) must be unique. + """ + # Find all packages. + packages = [] + for pkg_xml, zf, subdir in _find_packages(os.path.join(root, "packages")): + zipstat = os.stat(zf.filename) + url = f"{base_url}/{subdir}/{os.path.split(zf.filename)[1]}" + unzipped_size = sum(zf_info.file_size for zf_info in zf.infolist()) + + # Fill in several fields of the package xml with calculated values. + pkg_xml.set("unzipped_size", "%s" % unzipped_size) + pkg_xml.set("size", "%s" % zipstat.st_size) + pkg_xml.set("checksum", "%s" % md5_hexdigest(zf.filename)) + pkg_xml.set("subdir", subdir) + # pkg_xml.set('svn_revision', _svn_revision(zf.filename)) + if not pkg_xml.get("url"): + pkg_xml.set("url", url) + + # Record the package. 
def _check_package(pkg_xml, zipfilename, zf):
    """
    Helper for ``build_index()``: verify that a package's xml
    description and its zip file agree with each other.

    :param pkg_xml: Element whose ``id`` attribute names the package.
    :param zipfilename: Path of the package's zip file; its base name
        without extension is the identifier the package must carry.
    :param zf: Open zip file whose member names are inspected.
    :raise ValueError: If the xml ``id`` differs from the zip file's
        base name, or if any member of the zip file lies outside the
        single subdirectory named after that identifier.
    """
    # The filename must match the id given in the XML file.
    expected_id = os.path.splitext(os.path.basename(zipfilename))[0]
    declared_id = pkg_xml.get("id")
    if declared_id != expected_id:
        raise ValueError(
            "package identifier mismatch ({} vs {})".format(declared_id, expected_id)
        )

    # Every member must be the subdirectory itself or live beneath it.
    subdir_prefix = expected_id + "/"
    for member in zf.namelist():
        if member == expected_id or member.startswith(subdir_prefix):
            continue
        raise ValueError(
            "Zipfile %s.zip does not expand to a single "
            "subdirectory %s/" % (expected_id, expected_id)
        )
def _svn_revision(filename):
    """
    Helper for ``build_index()``: Calculate the subversion revision
    number for a given file (by using ``subprocess`` to run ``svn``).

    :param filename: Path of the file to query with ``svn status -v``.
    :return: The third whitespace-separated field of svn's output, as
        a ``str``.
    :raise ValueError: If ``svn`` cannot be started, exits with a
        non-zero status, writes anything to stderr, or produces output
        with fewer than three fields.
    """
    basename = os.path.split(filename)[1]
    try:
        # Read both pipes as text: the error message below is built with
        # textwrap.fill(), which only accepts str, and callers expect a
        # str revision rather than bytes.
        proc = subprocess.run(
            ["svn", "status", "-v", filename],
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            universal_newlines=True,
            errors="replace",
        )
    except OSError as e:
        # e.g. the svn executable is not installed.
        raise ValueError(
            "Error determining svn_revision for %s: %s" % (basename, e)
        ) from e

    if proc.returncode != 0 or proc.stderr or not proc.stdout:
        raise ValueError(
            "Error determining svn_revision for %s: %s"
            % (basename, textwrap.fill(proc.stderr))
        )

    fields = proc.stdout.split()
    if len(fields) < 3:
        raise ValueError(
            "Error determining svn_revision for %s: %s"
            % (basename, "unexpected svn output %r" % proc.stdout)
        )
    return fields[2]
+ packages = [] + for dirname, subdirs, files in os.walk(root): + relpath = "/".join(_path_from(root, dirname)) + for filename in files: + if filename.endswith(".xml"): + xmlfilename = os.path.join(dirname, filename) + zipfilename = xmlfilename[:-4] + ".zip" + try: + zf = zipfile.ZipFile(zipfilename) + except Exception as e: + raise ValueError(f"Error reading file {zipfilename!r}!\n{e}") from e + try: + pkg_xml = ElementTree.parse(xmlfilename).getroot() + except Exception as e: + raise ValueError(f"Error reading file {xmlfilename!r}!\n{e}") from e + + # Check that the UID matches the filename + uid = os.path.split(xmlfilename[:-4])[1] + if pkg_xml.get("id") != uid: + raise ValueError( + "package identifier mismatch (%s " + "vs %s)" % (pkg_xml.get("id"), uid) + ) + + # Check that the zipfile expands to a subdir whose + # name matches the uid. + if sum( + (name != uid and not name.startswith(uid + "/")) + for name in zf.namelist() + ): + raise ValueError( + "Zipfile %s.zip does not expand to a " + "single subdirectory %s/" % (uid, uid) + ) + + yield pkg_xml, zf, relpath + + elif filename.endswith(".zip"): + # Warn user in case a .xml does not exist for a .zip + resourcename = os.path.splitext(filename)[0] + xmlfilename = os.path.join(dirname, resourcename + ".xml") + if not os.path.exists(xmlfilename): + warnings.warn( + f"{filename} exists, but {resourcename + '.xml'} cannot be found! 
if __name__ == "__main__":
    # Command-line entry point.  Each positional argument is a package
    # identifier to download; with no positional arguments, download()
    # is called without an identifier.
    from optparse import OptionParser

    cli = OptionParser()
    cli.add_option(
        "-d", "--dir", dest="dir", metavar="DIR",
        help="download package to directory DIR",
    )
    # The three boolean switches share the same shape, so register them
    # from a table (order is kept, as it determines the --help listing).
    for short_flag, long_flag, dest_name, help_text in (
        ("-q", "--quiet", "quiet", "work quietly"),
        ("-f", "--force", "force", "download even if already installed"),
        ("-e", "--exit-on-error", "halt_on_error", "exit if an error occurs"),
    ):
        cli.add_option(
            short_flag,
            long_flag,
            dest=dest_name,
            action="store_true",
            default=False,
            help=help_text,
        )
    cli.add_option(
        "-u", "--url", dest="server_index_url",
        default=os.environ.get("NLTK_DOWNLOAD_URL"),
        help="download server index url",
    )

    options, args = cli.parse_args()

    downloader = Downloader(server_index_url=options.server_index_url)

    # Keyword arguments common to every download() call below.
    shared_kwargs = {
        "download_dir": options.dir,
        "quiet": options.quiet,
        "force": options.force,
        "halt_on_error": options.halt_on_error,
    }

    if not args:
        downloader.download(**shared_kwargs)
    else:
        for pkg_id in args:
            rv = downloader.download(info_or_id=pkg_id, **shared_kwargs)
            # Stop at the first failed package only when -e was given.
            if rv == False and options.halt_on_error:
                break