Merge branch 'fix-headless-tables' into develop

AlexVonB · AlexVonB · commit 967db26b3a7e · 2021-05-18T10:41:42.000+02:00
diff --git a/markdownify/__init__.py b/markdownify/__init__.py
@@ -84,18 +84,26 @@ def process_tag(self, node, convert_as_inline, children_only=False):
         if not children_only and isHeading:
             convert_children_as_inline = True
 
-        # Remove whitespace-only textnodes in lists
-        def is_list_node(el):
-            return el and el.name in ['ol', 'ul', 'li']
+        # Remove whitespace-only textnodes in purely nested nodes
+        def is_nested_node(el):
+            return el and el.name in ['ol', 'ul', 'li',
+                                      'table', 'thead', 'tbody', 'tfoot',
+                                      'tr', 'td', 'th']
 
-        if is_list_node(node):
+        if is_nested_node(node):
             for el in node.children:
-                # Only extract (remove) whitespace-only text node if any of the conditions is true:
+                # Only extract (remove) whitespace-only text node if any of the
+                # conditions is true:
                 # - el is the first element in its parent
                 # - el is the last element in its parent
-                # - el is adjacent to an list node
-                can_extract = not el.previous_sibling or not el.next_sibling or is_list_node(el.previous_sibling) or is_list_node(el.next_sibling)
-                if isinstance(el, NavigableString) and six.text_type(el).strip() == '' and can_extract:
+                # - el is adjacent to a nested node
+                can_extract = (not el.previous_sibling
+                               or not el.next_sibling
+                               or is_nested_node(el.previous_sibling)
+                               or is_nested_node(el.next_sibling))
+                if (isinstance(el, NavigableString)
+                        and six.text_type(el).strip() == ''
+                        and can_extract):
                     el.extract()
 
         # Convert the children first
@@ -277,21 +285,28 @@ def convert_img(self, el, text, convert_as_inline):
         return '![%s](%s%s)' % (alt, src, title_part)
 
     def convert_table(self, el, text, convert_as_inline):
-        rows = el.find_all('tr')
-        text_data = []
-        for row in rows:
-            headers = row.find_all('th')
-            columns = row.find_all('td')
-            if len(headers) > 0:
-                headers = [head.text.strip() for head in headers]
-                text_data.append('| ' + ' | '.join(headers) + ' |')
-                text_data.append('| ' + ' | '.join(['---'] * len(headers)) + ' |')
-            elif len(columns) > 0:
-                columns = [colm.text.strip() for colm in columns]
-                text_data.append('| ' + ' | '.join(columns) + ' |')
-            else:
-                continue
-        return '\n'.join(text_data)
+        return '\n\n' + text + '\n'
+
+    def convert_tr(self, el, text, convert_as_inline):
+        cells = el.find_all(['td', 'th'])
+        is_headrow = all([cell.name == 'th' for cell in cells])
+        overline = ''
+        underline = ''
+        if is_headrow and not el.previous_sibling:
+            # first row and is headline: print headline underline
+            underline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
+        elif not el.previous_sibling and not el.parent.name != 'table':
+            # first row, not a headline, and the row sits directly in <table>:
+            # print an empty headline above this row
+            overline += '| ' + ' | '.join([''] * len(cells)) + ' |' + '\n'
+            overline += '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n'
+        return overline + '|' + text + '\n' + underline
+
+    def convert_th(self, el, text, convert_as_inline):
+        return ' ' + text + ' |'
+
+    def convert_td(self, el, text, convert_as_inline):
+        return ' ' + text + ' |'
 
     def convert_hr(self, el, text, convert_as_inline):
         return '\n\n---\n\n'
diff --git a/setup.cfg b/setup.cfg
@@ -1,2 +1,2 @@
 [flake8]
-ignore = E501
+ignore = E501 W503
diff --git a/tests/test_conversions.py b/tests/test_conversions.py
@@ -1,5 +1,4 @@
 from markdownify import markdownify as md, ATX, ATX_CLOSED, BACKSLASH, UNDERSCORE
-import re
 
 
 nested_uls = """
@@ -41,8 +40,7 @@
     </ul>"""
 
 
-table = re.sub(r'\s+', '', """
-<table>
+table = """<table>
     <tr>
         <th>Firstname</th>
         <th>Lastname</th>
@@ -58,18 +56,54 @@
         <td>Jackson</td>
         <td>94</td>
     </tr>
-</table>
-""")
+</table>"""
 
 
-table_head_body = re.sub(r'\s+', '', """
-<table>
+table_with_html_content = """<table>
+    <tr>
+        <th>Firstname</th>
+        <th>Lastname</th>
+        <th>Age</th>
+    </tr>
+    <tr>
+        <td><b>Jill</b></td>
+        <td><i>Smith</i></td>
+        <td><a href="#">50</a></td>
+    </tr>
+    <tr>
+        <td>Eve</td>
+        <td>Jackson</td>
+        <td>94</td>
+    </tr>
+</table>"""
+
+
+table_with_header_column = """<table>
+    <tr>
+        <th>Firstname</th>
+        <th>Lastname</th>
+        <th>Age</th>
+    </tr>
+    <tr>
+        <th>Jill</th>
+        <td>Smith</td>
+        <td>50</td>
+    </tr>
+    <tr>
+        <th>Eve</th>
+        <td>Jackson</td>
+        <td>94</td>
+    </tr>
+</table>"""
+
+
+table_head_body = """<table>
     <thead>
-            <tr>
+        <tr>
             <th>Firstname</th>
             <th>Lastname</th>
             <th>Age</th>
-            </tr>
+        </tr>
     </thead>
     <tbody>
         <tr>
@@ -83,17 +117,15 @@
             <td>94</td>
         </tr>
     </tbody>
-</table>
-""")
+</table>"""
 
-table_missing_text = re.sub(r'\s+', '', """
-<table>
+table_missing_text = """<table>
     <thead>
-            <tr>
+        <tr>
             <th></th>
             <th>Lastname</th>
             <th>Age</th>
-            </tr>
+        </tr>
     </thead>
     <tbody>
         <tr>
@@ -107,8 +139,25 @@
             <td>94</td>
         </tr>
     </tbody>
-</table>
-""")
+</table>"""
+
+table_missing_head = """<table>
+    <tr>
+        <td>Firstname</td>
+        <td>Lastname</td>
+        <td>Age</td>
+    </tr>
+    <tr>
+        <td>Jill</td>
+        <td>Smith</td>
+        <td>50</td>
+    </tr>
+    <tr>
+        <td>Eve</td>
+        <td>Jackson</td>
+        <td>94</td>
+    </tr>
+</table>"""
 
 
 def test_chomp():
@@ -322,9 +371,12 @@ def test_div():
 
 
 def test_table():
-    assert md(table) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |'
-    assert md(table_head_body) == '| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |'
-    assert md(table_missing_text) == '|  | Lastname | Age |\n| --- | --- | --- |\n| Jill |  | 50 |\n| Eve | Jackson | 94 |'
+    assert md(table) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_html_content) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| **Jill** | *Smith* | [50](#) |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_with_header_column) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_head_body) == '\n\n| Firstname | Lastname | Age |\n| --- | --- | --- |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_missing_text) == '\n\n|  | Lastname | Age |\n| --- | --- | --- |\n| Jill |  | 50 |\n| Eve | Jackson | 94 |\n\n'
+    assert md(table_missing_head) == '\n\n|  |  |  |\n| --- | --- | --- |\n| Firstname | Lastname | Age |\n| Jill | Smith | 50 |\n| Eve | Jackson | 94 |\n\n'
 
 
 def test_strong_em_symbol():

Original file line number	Diff line number	Diff line change
`@@ -1,2 +1,2 @@`
`1`	`1`	`[flake8]`
`2`		`-ignore = E501`
	`2`	`+ignore = E501 W503`