Merge branch 'release0.11.1' into develop

Markus Konrad · Markus Konrad · commit dcaaee1c4299 · 2022-02-10T14:24:03.000+01:00
diff --git a/.github/stale.yml b/.github/stale.yml
diff --git a/.github/workflows/runtests.yml b/.github/workflows/runtests.yml
@@ -20,30 +20,32 @@ jobs:
       matrix:
         os: [ubuntu-latest, macos-latest, windows-latest]
         python-version: ["3.8", "3.9", "3.10"]
+        testsuite: ["minimal", "full"]
     steps:
       - uses: actions/checkout@v2
       - name: set up python ${{ matrix.python-version }}
         uses: actions/setup-python@v2
         with:
           python-version: ${{ matrix.python-version }}
           cache: 'pip'
-      - name: install system dependencies
-        if: runner.os != 'Windows'
+      - name: install system dependencies (linux)
+        if: runner.os == 'Linux'
         # only managed to install system dependencies on Linux runners
         run: |
-          if [ "$RUNNER_OS" == "Linux" ]; then
-            sudo apt update
-            sudo apt install libgmp-dev libmpfr-dev libmpc-dev
-          fi
+          sudo apt update
+          sudo apt install libgmp-dev libmpfr-dev libmpc-dev
       - name: install python dependencies
         run: |
           python -m pip install --upgrade pip
           pip install tox
       - name: run tox (linux)
         # since system dependencies could only be installed on Linux runners, we run the "full" suite only on Linux ...
         if: runner.os == 'Linux'
-        run: tox -e py-full
-      - name: run tox (macos or windows)
-        # ... on all other OS we run the "recommendedextra" suite
-        if: runner.os != 'Linux'
-        run: tox -e py-recommendedextra
+        run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci
+      - name: run tox (macos or windows - minimal)
+        if: runner.os != 'Linux' && matrix.testsuite == 'minimal'
+        run: tox -e py-minimal -- --hypothesis-profile=ci
+      - name: run tox (macos or windows - recommendedextra)
+        # ... on all other OS we run the "recommendedextra" suite instead of the "full" suite
+        if: runner.os != 'Linux' && matrix.testsuite == 'full'
+        run: tox -e py-recommendedextra -- --hypothesis-profile=ci
diff --git a/.github/workflows/stale.yml b/.github/workflows/stale.yml
@@ -0,0 +1,23 @@
+name: Close inactive issues
+on:
+  schedule:
+    - cron: "23 3 * * *"
+
+jobs:
+  close-issues:
+    runs-on: ubuntu-latest
+    permissions:
+      issues: write
+      pull-requests: write
+    steps:
+      - uses: actions/stale@v3
+        with:
+          days-before-issue-stale: 30
+          days-before-issue-close: 14
+          stale-issue-label: "stale"
+          stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
+          close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+          days-before-pr-stale: -1
+          days-before-pr-close: -1
+          repo-token: ${{ secrets.GITHUB_TOKEN }}
+
diff --git a/conftest.py b/conftest.py
@@ -0,0 +1,11 @@
+"""
+Configuration for tests with pytest
+
+.. codeauthor:: Markus Konrad <markus.konrad@wzb.eu>
+"""
+
+from hypothesis import settings, HealthCheck
+
+# profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck
+# and set the timeout deadline very high (60 sec.)
+settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000)
diff --git a/doc/source/version_history.rst b/doc/source/version_history.rst
@@ -3,6 +3,11 @@
 Version history
 ===============
 
+0.11.1 - 2022-02-10
+-------------------
+
+- show better error messages when dependencies for optional module ``corpus`` are not met
+- fix a SciPy deprecation warning
 
 0.11.0 - 2022-02-08
 -------------------
diff --git a/examples/minimal_tfidf.py b/examples/minimal_tfidf.py
@@ -0,0 +1,33 @@
+"""
+A minimal example to showcase a few features of tmtoolkit.
+
+Markus Konrad <markus.konrad@wzb.eu>
+Feb. 2022
+"""
+
+from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm
+from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table
+
+
+# load built-in sample dataset and use 4 worker processes
+corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)
+
+# investigate corpus as dataframe
+toktbl = tokens_table(corp)
+print(toktbl)
+
+# apply some text normalization
+lemmatize(corp)
+to_lowercase(corp)
+
+# build sparse document-token matrix (DTM)
+# document labels identify rows, vocabulary tokens identify columns
+mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)
+
+# apply tf-idf transformation to DTM
+# operation is applied to the sparse matrix and uses little memory
+tfidf_mat = tfidf(mat)
+
+# show top 5 tokens per document ranked by tf-idf
+top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)
+print(top_tokens)
diff --git a/setup.py b/setup.py
@@ -1,5 +1,7 @@
 """
 tmtoolkit setuptools based setup module
+
+.. codeauthor:: Markus Konrad <markus.konrad@wzb.eu>
 """
 
 import os
@@ -8,7 +10,7 @@
 from setuptools import setup, find_packages
 
 __title__ = 'tmtoolkit'
-__version__ = '0.11.1.dev'
+__version__ = '0.11.1'
 __author__ = 'Markus Konrad'
 __license__ = 'Apache License 2.0'
 
diff --git a/tests/_testtools.py b/tests/_testtools.py
@@ -32,7 +32,7 @@ def strategy_dtm():
 
 
 def strategy_dtm_small():
-    return strategy_2d_array(int, 0, 10, min_side=2, max_side=10)
+    return strategy_2d_array(int, 0, 10, min_side=2, max_side=6)
 
 
 def strategy_2d_prob_distribution():
diff --git a/tests/test_corpus.py b/tests/test_corpus.py
@@ -1687,8 +1687,10 @@ def test_kwic_table_hypothesis(corpora_en_serial_and_parallel_module, **args):
                             else:
                                 assert s in dkwic_tok
                     else:
-                        if len(corp[lbl]) > 1:
-                            assert all([args['glue'] in x for x in dkwic[matchattr]])
+                        # disabled since this is not always the case: if the keyword is in a very small document or
+                        # at the start or end of a sentence, there may not be the "glue" string in the context
+                        # if len(corp[lbl]) > 1:
+                        #     assert all([args['glue'] in x for x in dkwic[matchattr]])
 
                         if not args['inverse']:
                             assert all([s in x for x in dkwic[matchattr]])
diff --git a/tests/test_topicmod_model_stats.py b/tests/test_topicmod_model_stats.py
@@ -376,11 +376,6 @@ def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
 
 
 def test_filter_topics():
-    try:
-        import tmtoolkit.corpus
-    except ImportError:
-        pytest.skip("text processing dependencies not installed")
-
     vocab = np.array(['abc', 'abcd', 'cde', 'efg', 'xyz'])
     distrib = np.array([                  # top 3 terms:
         [0.6, 0.3, 0.05, 0.025, 0.025],   # abc, abcd, cde
diff --git a/tests/test_topicmod_visualize.py b/tests/test_topicmod_visualize.py
@@ -1,5 +1,4 @@
 import os
-import random
 
 import pytest
 from hypothesis import given, strategies as st, settings
@@ -10,7 +9,7 @@
 from ._testtools import strategy_2d_prob_distribution
 
 from tmtoolkit.utils import empty_chararray
-from tmtoolkit.topicmod import model_io, visualize, evaluate
+from tmtoolkit.topicmod import model_io, visualize
 
 
 def test_generate_wordclouds_for_topic_words():
diff --git a/tmtoolkit/__init__.py b/tmtoolkit/__init__.py
@@ -8,7 +8,7 @@
 import logging
 
 __title__ = 'tmtoolkit'
-__version__ = '0.11.1.dev'
+__version__ = '0.11.1'
 __author__ = 'Markus Konrad'
 __license__ = 'Apache License 2.0'
 
diff --git a/tmtoolkit/bow/dtm.py b/tmtoolkit/bow/dtm.py
@@ -132,7 +132,7 @@ def dtm_to_gensim_corpus(dtm):
         else:
             dtm_sparse = dtm_t
     else:
-        from scipy.sparse.csc import csc_matrix
+        from scipy.sparse import csc_matrix
         dtm_sparse = csc_matrix(dtm_t)
 
     return gensim.matutils.Sparse2Corpus(dtm_sparse)
diff --git a/tmtoolkit/topicmod/model_stats.py b/tmtoolkit/topicmod/model_stats.py
@@ -564,8 +564,6 @@ def filter_topics(search_pattern, vocab, topic_word_distrib, top_n=None, thresh=
     If `return_words_and_matches` is True, this function additionally returns a NumPy array with the top words for each
     topic and a NumPy array with the pattern matches for each topic.
 
-    .. note:: Using this function requires that you've installed tmtoolkit with the `[textproc]` option.
-
     .. seealso:: See :func:`tmtoolkit.tokenseq.token_match` for filtering options.
 
     :param search_pattern: single match pattern string or list of match pattern strings