Skip to content
This repository was archived by the owner on May 3, 2023. It is now read-only.

Commit dcaaee1

Browse files
author
Markus Konrad
committed
Merge branch 'release0.11.1' into develop
2 parents 028406a + c98881e commit dcaaee1

File tree

14 files changed

+96
-37
lines changed

14 files changed

+96
-37
lines changed

.github/stale.yml

Lines changed: 0 additions & 11 deletions
This file was deleted.

.github/workflows/runtests.yml

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -20,30 +20,32 @@ jobs:
2020
matrix:
2121
os: [ubuntu-latest, macos-latest, windows-latest]
2222
python-version: ["3.8", "3.9", "3.10"]
23+
testsuite: ["minimal", "full"]
2324
steps:
2425
- uses: actions/checkout@v2
2526
- name: set up python ${{ matrix.python-version }}
2627
uses: actions/setup-python@v2
2728
with:
2829
python-version: ${{ matrix.python-version }}
2930
cache: 'pip'
30-
- name: install system dependencies
31-
if: runner.os != 'Windows'
31+
- name: install system dependencies (linux)
32+
if: runner.os == 'Linux'
3233
# only managed to install system dependencies on Linux runners
3334
run: |
34-
if [ "$RUNNER_OS" == "Linux" ]; then
35-
sudo apt update
36-
sudo apt install libgmp-dev libmpfr-dev libmpc-dev
37-
fi
35+
sudo apt update
36+
sudo apt install libgmp-dev libmpfr-dev libmpc-dev
3837
- name: install python dependencies
3938
run: |
4039
python -m pip install --upgrade pip
4140
pip install tox
4241
- name: run tox (linux)
4342
# since system dependencies could only be installed on Linux runners, we run the "full" suite only on Linux ...
4443
if: runner.os == 'Linux'
45-
run: tox -e py-full
46-
- name: run tox (macos or windows)
47-
# ... on all other OS we run the "recommendedextra" suite
48-
if: runner.os != 'Linux'
49-
run: tox -e py-recommendedextra
44+
run: tox -e py-${{ matrix.testsuite }} -- --hypothesis-profile=ci
45+
- name: run tox (macos or windows - minimal)
46+
if: runner.os != 'Linux' && matrix.testsuite == 'minimal'
47+
run: tox -e py-minimal -- --hypothesis-profile=ci
48+
- name: run tox (macos or windows - recommendedextra)
49+
# ... on all other OS we run the "recommendedextra" suite instead of the "full" suite
50+
if: runner.os != 'Linux' && matrix.testsuite == 'full'
51+
run: tox -e py-recommendedextra -- --hypothesis-profile=ci

.github/workflows/stale.yml

Lines changed: 23 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,23 @@
1+
name: Close inactive issues
2+
on:
3+
schedule:
4+
- cron: "23 3 * * *"
5+
6+
jobs:
7+
close-issues:
8+
runs-on: ubuntu-latest
9+
permissions:
10+
issues: write
11+
pull-requests: write
12+
steps:
13+
- uses: actions/stale@v3
14+
with:
15+
days-before-issue-stale: 30
16+
days-before-issue-close: 14
17+
stale-issue-label: "stale"
18+
stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
19+
close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
20+
days-before-pr-stale: -1
21+
days-before-pr-close: -1
22+
repo-token: ${{ secrets.GITHUB_TOKEN }}
23+

conftest.py

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,11 @@
1+
"""
2+
Configuration for tests with pytest
3+
4+
.. codeauthor:: Markus Konrad <[email protected]>
5+
"""
6+
7+
from hypothesis import settings, HealthCheck
8+
9+
# profile for CI runs on GitHub machines, which may be slow from time to time so we disable the "too slow" HealthCheck
10+
# and set the timeout deadline very high (60 sec.)
11+
settings.register_profile('ci', suppress_health_check=(HealthCheck.too_slow, ), deadline=60000)

doc/source/version_history.rst

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,11 @@
33
Version history
44
===============
55

6+
0.11.1 - 2022-02-10
7+
-------------------
8+
9+
- show better error messages when dependencies for optional module ``corpus`` are not met
10+
- fix a SciPy deprecation warning
611

712
0.11.0 - 2022-02-08
813
-------------------

examples/minimal_tfidf.py

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
"""
2+
A minimal example to showcase a few features of tmtoolkit.
3+
4+
Markus Konrad <[email protected]>
5+
Feb. 2022
6+
"""
7+
8+
from tmtoolkit.corpus import Corpus, tokens_table, lemmatize, to_lowercase, dtm
9+
from tmtoolkit.bow.bow_stats import tfidf, sorted_terms_table
10+
11+
12+
# load built-in sample dataset and use 4 worker processes
13+
corp = Corpus.from_builtin_corpus('en-News100', max_workers=4)
14+
15+
# investigate corpus as dataframe
16+
toktbl = tokens_table(corp)
17+
print(toktbl)
18+
19+
# apply some text normalization
20+
lemmatize(corp)
21+
to_lowercase(corp)
22+
23+
# build sparse document-token matrix (DTM)
24+
# document labels identify rows, vocabulary tokens identify columns
25+
mat, doc_labels, vocab = dtm(corp, return_doc_labels=True, return_vocab=True)
26+
27+
# apply tf-idf transformation to DTM
28+
# operation is applied on the sparse matrix and uses little memory
29+
tfidf_mat = tfidf(mat)
30+
31+
# show top 5 tokens per document ranked by tf-idf
32+
top_tokens = sorted_terms_table(tfidf_mat, vocab, doc_labels, top_n=5)
33+
print(top_tokens)

setup.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,7 @@
11
"""
22
tmtoolkit setuptools based setup module
3+
4+
.. codeauthor:: Markus Konrad <[email protected]>
35
"""
46

57
import os
@@ -8,7 +10,7 @@
810
from setuptools import setup, find_packages
911

1012
__title__ = 'tmtoolkit'
11-
__version__ = '0.11.1.dev'
13+
__version__ = '0.11.1'
1214
__author__ = 'Markus Konrad'
1315
__license__ = 'Apache License 2.0'
1416

tests/_testtools.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -32,7 +32,7 @@ def strategy_dtm():
3232

3333

3434
def strategy_dtm_small():
35-
return strategy_2d_array(int, 0, 10, min_side=2, max_side=10)
35+
return strategy_2d_array(int, 0, 10, min_side=2, max_side=6)
3636

3737

3838
def strategy_2d_prob_distribution():

tests/test_corpus.py

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1687,8 +1687,10 @@ def test_kwic_table_hypothesis(corpora_en_serial_and_parallel_module, **args):
16871687
else:
16881688
assert s in dkwic_tok
16891689
else:
1690-
if len(corp[lbl]) > 1:
1691-
assert all([args['glue'] in x for x in dkwic[matchattr]])
1690+
# disabled since this is not always the case: if the keyword is in a very small document or at the
1691+
# start or end of a sentence, there may not be the "glue" string in the context
1692+
# if len(corp[lbl]) > 1:
1693+
# assert all([args['glue'] in x for x in dkwic[matchattr]])
16921694

16931695
if not args['inverse']:
16941696
assert all([s in x for x in dkwic[matchattr]])

tests/test_topicmod_model_stats.py

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -376,11 +376,6 @@ def test_generate_topic_labels_from_top_words(dtm, n_topics, lambda_):
376376

377377

378378
def test_filter_topics():
379-
try:
380-
import tmtoolkit.corpus
381-
except ImportError:
382-
pytest.skip("text processing dependencies not installed")
383-
384379
vocab = np.array(['abc', 'abcd', 'cde', 'efg', 'xyz'])
385380
distrib = np.array([ # top 3 terms:
386381
[0.6, 0.3, 0.05, 0.025, 0.025], # abc, abcd, cde

0 commit comments

Comments
 (0)