Merge branch 'develop'

Markus Konrad · Markus Konrad · commit ecf69d1e0df9 · 2022-02-10T14:24:34.000+01:00
diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
@@ -20,30 +20,32 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10"]
+        testsuite: ["minimal", "full"]
     steps:
       - uses: actions/checkout@v2
       - name: set up python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
-      - name: install system dependencies
-        if: runner.os != 'Windows'
+      - name: install system dependencies (linux)
+        if: runner.os == 'Linux'
         # only managed to install system dependencies on Linux runners
         run: |
-          if [ "$RUNNER_OS" == "Linux" ]; then
-            sudo apt update
-            sudo apt install libgmp-dev libmpfr-dev libmpc-dev
-          fi
+          sudo apt update
+          sudo apt install libgmp-dev libmpfr-dev libmpc-dev
       - name: install python dependencies
         run: |
           python -m pip install --upgrade pip
           pip install tox
       - name: run tox (linux)
         # since system dependencies could only be installed on Linux runners, we run the "full" suite only on Linux ...
         if: runner.os == 'Linux'
-        run: tox -e py-full -- --hypothesis-profile=ci
-      - name: run tox (macos or windows)
-        # ... on all other OS we run the "recommendedextra" suite
-        if: runner.os != 'Linux'
+        run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci
+      - name: run tox (macos or windows - minimal)
+        if: runner.os != 'Linux' && matrix.testsuite == 'minimal'
+        run: tox -e py-minimal -- --hypothesis-profile=ci
+      - name: run tox (macos or windows - recommendedextra)
+        # ... on all other OS we run the "recommendedextra" suite instead of the "full" suite
+        if: runner.os != 'Linux' && matrix.testsuite == 'full'
         run: tox -e py-recommendedextra -- --hypothesis-profile=ci
diff --git a/README.rst b/README.rst
@@ -14,12 +14,37 @@ The documentation for tmtoolkit is available on `tmtoolkit.readthedocs.org <http
 the GitHub code repository is on
 `github.com/WZBSocialScienceCenter/tmtoolkit <https://github.com/WZBSocialScienceCenter/tmtoolkit>`_.
 
-.. note:: Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API
-          for text processing and mining which is incompatible with prior versions. It's advisable to first read the
-          first three chapters of the `tutorial <https://tmtoolkit.readthedocs.io/en/latest/getting_started.html>`_
-          to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely
-          remove the old version prior to upgrading. See the
-          `installation instructions <https://tmtoolkit.readthedocs.io/en/latest/install.html>`_.
+**Upgrade note:**
+
+Since Feb 8 2022, the newest version 0.11.0 of tmtoolkit is available on PyPI. This version features a new API
+for text processing and mining which is incompatible with prior versions. It's advisable to first read the
+first three chapters of the `tutorial <https://tmtoolkit.readthedocs.io/en/latest/getting_started.html>`_
+to get used to the new API. You should also re-install tmtoolkit in a new virtual environment or completely
+remove the old version prior to upgrading. See the
+`installation instructions <https://tmtoolkit.readthedocs.io/en/latest/install.html>`_.
+
+Requirements and installation
+-----------------------------
+
+**tmtoolkit works with Python 3.8 or newer (tested up to Python 3.10).**
+
+The tmtoolkit package is highly modular and tries to install as few dependencies as possible. For requirements and
+installation procedures, please have a look at the
+`installation section in the documentation <https://tmtoolkit.readthedocs.io/en/latest/install.html>`_. In short,
+the recommended way of installing tmtoolkit is to create and activate a
+`Python Virtual Environment ("venv") <https://docs.python.org/3/tutorial/venv.html>`_ and then install tmtoolkit with
+a recommended set of dependencies and a list of language models via the following:
+
+.. code-block:: text
+
+    pip install -U "tmtoolkit[recommended]"
+    # add or remove language codes in the list for installing the models that you need;
+    # don't use spaces in the list of languages
+    python -m tmtoolkit setup en,de
+
+Again, you should have a look at the detailed
+`installation instructions <https://tmtoolkit.readthedocs.io/en/latest/install.html>`_ in order to install additional
+packages that enable more features such as topic modeling.
 
 Features
 --------
@@ -93,14 +118,8 @@ Limits
 * all data must reside in memory, i.e. no streaming of large data from the hard disk (which for example
   `Gensim <https://radimrehurek.com/gensim/>`_ supports)
 
-Requirements and installation
-==============================
-
-For requirements and installation procedures, please have a look at the
-`installation section in the documentation <https://tmtoolkit.readthedocs.io/en/latest/install.html>`_.
-
 License
-=======
+-------
 
 Code licensed under `Apache License 2.0 <https://www.apache.org/licenses/LICENSE-2.0>`_.
 See `LICENSE <https://github.com/WZBSocialScienceCenter/tmtoolkit/blob/master/LICENSE>`_ file.
diff --git a/conftest.py b/conftest.py
@@ -7,4 +7,5 @@
 from hypothesis import settings, HealthCheck
 
 # profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck
-settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ))
+# and set the timeout deadline very high (60 sec.)
+settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000)
diff --git a/doc/source/install.rst b/doc/source/install.rst
@@ -68,21 +68,22 @@ on the preferred package for topic modeling:
     # you may also select several topic modeling packages
     pip install -U "tmtoolkit[recommended,lda,sklearn,gensim]"
 
-The minimal installation will only install a base set of dependencies and will only enable the modules for BoW
+The **minimal** installation will only install a base set of dependencies and will only enable the modules for BoW
 statistics, token sequence operations, topic modeling and utility functions. You can install it as follows:
 
 .. code-block:: text
 
+    # alternative installation if you only want to install a minimum set of dependencies
     pip install -U tmtoolkit
 
-.. note::
-    The tmtoolkit package is about 7MB big, because it contains some example corpora.
+.. note:: The tmtoolkit package is about 7 MB in size because it contains some example corpora.
 
-After that, you should initially run tmtoolkit's setup routine. This makes sure that all required data files are
+**After that, you should initially run tmtoolkit's setup routine.** This makes sure that all required data files are
 present and downloads them if necessary. You should specify a list of languages for which language models should be
 downloaded and installed. The list of available language models corresponds with the models provided by
 `SpaCy <https://spacy.io/usage/models#languages>`_ (except for "multi-language"). You need to specify the two-letter ISO
-language code for the language models that you want to install. E.g. in order to install models for English and German:
+language code for the language models that you want to install. **Don't use spaces in the list of languages.**
+E.g. in order to install models for English and German:
 
 .. code-block:: text
 
diff --git a/doc/source/version_history.rst b/doc/source/version_history.rst
@@ -3,6 +3,11 @@
 Version history
 ===============
 
+0.11.1 - 2022-02-10
+-------------------
+
+- show better error messages when dependencies for optional module ``corpus`` are not met
+- fix a SciPy deprecation warning
 
 0.11.0 - 2022-02-08
 -------------------
diff --git a/examples/README.md b/examples/README.md
@@ -1,3 +1,6 @@
 # Examples
 
-This folder contains very few examples for *tmtoolkit*. The majority of examples is available as Jupyter Notebooks as part of the [documentation](https://tmtoolkit.readthedocs.io/). You may download these notebooks from the [documentation source](https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/doc/source) and run them on your computer.
+This folder contains very few examples for *tmtoolkit*. The majority of examples are available as Jupyter Notebooks as
+part of the [documentation](https://tmtoolkit.readthedocs.io/). You may download these notebooks from
+the [documentation source](https://github.com/WZBSocialScienceCenter/tmtoolkit/tree/master/doc/source) and run them
+on your computer.
diff --git a/examples/benchmark_en_newsarticles.py b/examples/benchmark_en_newsarticles.py
@@ -1,6 +1,14 @@
 """
 Benchmarking script that loads and processes English language test corpus with Corpus in parallel.
 
+This example requires that you have installed tmtoolkit with the recommended set of packages and have installed an
+English language model for spaCy:
+
+    pip install -U "tmtoolkit[recommended]"
+    python -m tmtoolkit setup en
+
+For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html
+
 To benchmark whole script with `time` from command line run:
 
     PYTHONPATH=.. /usr/bin/time -v python benchmark_en_newsarticles.py [NUMBER OF WORKERS]
diff --git a/examples/bundestag18_tfidf.py b/examples/bundestag18_tfidf.py
@@ -9,6 +9,14 @@
 
 The data for the debates comes from offenesparlament.de, see https://github.com/Datenschule/offenesparlament-data.
 
+This example requires that you have installed tmtoolkit with the recommended set of packages and have installed a
+German language model for spaCy:
+
+    pip install -U "tmtoolkit[recommended]"
+    python -m tmtoolkit setup de
+
+For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html
+
 Markus Konrad <markus.konrad@wzb.eu>
 June 2019 / Feb. 2022
 """
diff --git a/examples/gensim_evaluation.py b/examples/gensim_evaluation.py
@@ -1,11 +1,17 @@
 """
 An example for topic modeling evaluation with gensim.
 
-Please note that this is just an example for showing how to perform Topic Model evaluation with Gensim. The
+Please note that this is just an example for showing how to perform topic model evaluation with Gensim. The
 preprocessing of the data is just done quickly and probably not the best way for the given data.
 
-**Important note for Windows users:**
-You need to wrap all of the following code in a `if __name__ == '__main__'` block (just as in `lda_evaluation.py`).
+This example requires that you have installed tmtoolkit with the recommended set of packages plus Gensim and have
+installed a German language model for spaCy:
+
+    pip install -U "tmtoolkit[recommended,gensim]"
+    python -m tmtoolkit setup de
+
+For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html
+
 """
 
 
diff --git a/examples/topicmod_lda.py b/examples/topicmod_lda.py
@@ -2,6 +2,14 @@
 An example for topic modeling with LDA with focus on the new plotting functions in `tmtoolkit.corpus.visualize` and
 in `tmtoolkit.topicmod.visualize`.
 
+This example requires that you have installed tmtoolkit with the recommended set of packages plus "lda" and have
+installed an English language model for spaCy:
+
+    pip install -U "tmtoolkit[recommended,lda]"
+    python -m tmtoolkit setup en
+
+For more information, see the installation instructions: https://tmtoolkit.readthedocs.io/en/latest/install.html
+
 .. codeauthor:: Markus Konrad <markus.konrad@wzb.eu>
 """
 
diff --git a/setup.py b/setup.py
@@ -10,7 +10,7 @@
 from setuptools import setup, find_packages
 
 __title__ = 'tmtoolkit'
-__version__ = '0.11.0'
+__version__ = '0.11.1'
 __author__ = 'Markus Konrad'
 __license__ = 'Apache License 2.0'
 
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -5,6 +5,7 @@
 
 .. codeauthor:: Markus Konrad <markus.konrad@wzb.eu>
 """
+
 import math
 import os.path
 import random
@@ -20,8 +21,8 @@
 import pytest
 from hypothesis import given, strategies as st, settings
 
-if not find_spec('spacy'):
-    pytest.skip("skipping tmtoolkit.corpus tests (spacy not installed)", allow_module_level=True)
+if any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')):
+    pytest.skip("skipping tmtoolkit.corpus tests (required packages not installed)", allow_module_level=True)
 
 import spacy
 from spacy.tokens import Doc
diff --git a/tests/test_corpusimport.py b/tests/test_corpusimport.py
@@ -0,0 +1,23 @@
+"""
+Tests for importing optional tmtoolkit.corpus module.
+
+.. codeauthor:: Markus Konrad <markus.konrad@wzb.eu>
+"""
+
+from importlib.util import find_spec
+
+import pytest
+
+
+def test_import_corpus():
+    if any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')):
+        with pytest.raises(RuntimeError, match='^the required package'):
+            from tmtoolkit import corpus
+        with pytest.raises(RuntimeError, match='^the required package'):
+            from tmtoolkit.corpus import Corpus
+    else:
+        from tmtoolkit import corpus
+        from tmtoolkit.corpus import Corpus
+        import spacy
+        import bidict
+        import loky
diff --git a/tests/test_topicmod_model_stats.py b/tests/test_topicmod_model_stats.py
@@ -376,11 +376,6 @@ def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
 
 
 def test_filter_topics():
-    try:
-        import tmtoolkit.corpus
-    except ImportError:
-        pytest.skip("text processing dependencies not installed")
-
     vocab = np.array(['abc', 'abcd', 'cde', 'efg', 'xyz'])
     distrib = np.array([                  # top 3 terms:
         [0.6, 0.3, 0.05, 0.025, 0.025],   # abc, abcd, cde
diff --git a/tmtoolkit/__init__.py b/tmtoolkit/__init__.py
@@ -8,7 +8,7 @@
 import logging
 
 __title__ = 'tmtoolkit'
-__version__ = '0.11.0'
+__version__ = '0.11.1'
 __author__ = 'Markus Konrad'
 __license__ = 'Apache License 2.0'
 
@@ -19,5 +19,5 @@
 
 from . import bow, topicmod, tokenseq, types, utils
 
-if find_spec('spacy') and find_spec('globre'):
+if not any(find_spec(pkg) is None for pkg in ('spacy', 'bidict', 'loky')):
     from . import corpus
diff --git a/tmtoolkit/corpus/__init__.py b/tmtoolkit/corpus/__init__.py
@@ -10,6 +10,12 @@
 
 from importlib.util import find_spec
 
+for pkg in ('spacy', 'bidict', 'loky'):
+    if find_spec(pkg) is None:
+        raise RuntimeError(f'the required package "{pkg}" for text processing is not installed; did you install '
+                           f'tmtoolkit with "recommended" or "textproc" option? see '
+                           f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information')
+
 from ..tokenseq import strip_tags, numbertoken_to_magnitude, simplify_unicode_chars
 
 from ._common import DEFAULT_LANGUAGE_MODELS, LANGUAGE_LABELS, simplified_pos
diff --git a/tmtoolkit/corpus/_corpus.py b/tmtoolkit/corpus/_corpus.py
@@ -208,8 +208,11 @@ def __init__(self, docs: Optional[Union[Dict[str, str], Sequence[Document]]] = N
             # model meta information
             try:
                 model_info = spacy.info(language_model)
-            except RuntimeError:
-                raise ValueError(f'language model "{language_model}" cannot be loaded; are you sure it is installed?')
+            except (RuntimeError, SystemExit):
+                raise RuntimeError(f'language model "{language_model}" cannot be loaded; are you sure it is installed? '
+                                   f'see https://spacy.io/models or '
+                                   f'https://tmtoolkit.readthedocs.io/en/latest/install.html for further information '
+                                   f'on installing language models')
 
             # the default pipeline compenents for SpaCy language models – these would be loaded *and enabled* if not
             # explicitly excluded
diff --git a/tmtoolkit/topicmod/model_stats.py b/tmtoolkit/topicmod/model_stats.py
@@ -564,8 +564,6 @@ def filter_topics(search_pattern, vocab, topic_word_distrib, top_n=None, thresh=
     If `return_words_and_matches` is True, this function additionally returns a NumPy array with the top words for each
     topic and a NumPy array with the pattern matches for each topic.
 
-    .. note:: Using this function requires that you've installed tmtoolkit with the `[textproc]` option.
-
     .. seealso:: See :func:`tmtoolkit.tokenseq.token_match` for filtering options.
 
     :param search_pattern: single match pattern string or list of match pattern strings