@@ -84,18 +84,26 @@ def process_tag(self, node, convert_as_inline, children_only=False):
8484 if not children_only and isHeading :
8585 convert_children_as_inline = True
8686
87- # Remove whitespace-only textnodes in lists
88- def is_list_node (el ):
89- return el and el .name in ['ol' , 'ul' , 'li' ]
87+ # Remove whitespace-only textnodes in purely nested nodes
def is_nested_node(el):
    """Tell whether ``el`` is a list or table structure element.

    A falsy ``el`` (for example a missing sibling) is handed back
    unchanged; otherwise the result is True when ``el.name`` is one of
    the list/table tag names below and False when it is not.
    """
    if not el:
        return el
    nested_tag_names = (
        'ol', 'ul', 'li',
        'table', 'thead', 'tbody', 'tfoot',
        'tr', 'td', 'th',
    )
    return el.name in nested_tag_names
9092
91- if is_list_node (node ):
93+ if is_nested_node (node ):
9294 for el in node .children :
93- # Only extract (remove) whitespace-only text node if any of the conditions is true:
95+ # Only extract (remove) whitespace-only text node if any of the
96+ # conditions is true:
9497 # - el is the first element in its parent
9598 # - el is the last element in its parent
96- # - el is adjacent to an list node
97- can_extract = not el .previous_sibling or not el .next_sibling or is_list_node (el .previous_sibling ) or is_list_node (el .next_sibling )
98- if isinstance (el , NavigableString ) and six .text_type (el ).strip () == '' and can_extract :
99+ # - el is adjacent to a nested node
100+ can_extract = (not el .previous_sibling
101+ or not el .next_sibling
102+ or is_nested_node (el .previous_sibling )
103+ or is_nested_node (el .next_sibling ))
104+ if (isinstance (el , NavigableString )
105+ and six .text_type (el ).strip () == ''
106+ and can_extract ):
99107 el .extract ()
100108
101109 # Convert the children first
@@ -277,21 +285,28 @@ def convert_img(self, el, text, convert_as_inline):
277285 return '' % (alt , src , title_part )
278286
def convert_table(self, el, text, convert_as_inline):
    """Wrap the already-converted table body in surrounding newlines.

    ``text`` is the converted content of the table's children; ``el``
    and ``convert_as_inline`` are not used here.
    """
    # NOTE(review): the space after each newline in these literals may
    # be an extraction artifact -- confirm against the original file.
    leading = '\n \n '
    trailing = '\n '
    return leading + text + trailing
289+
def convert_tr(self, el, text, convert_as_inline):
    """Render one table row and, where needed, the header separator.

    ``text`` is the already-converted content of the row's cells (each
    cell contributes ``' <content> |'``), so the row only needs a
    leading ``'|'`` and a line break.

    * If the row is the first one in its parent and consists only of
      ``th`` cells, a ``| --- | --- |`` separator is emitted *below* it.
    * If the row is the first one, is not a pure header row, and opens
      the table (its parent is the ``table`` itself, or a ``tbody``
      with no preceding sibling such as a ``thead``), an empty header
      row plus separator is emitted *above* it, so the table still has
      a header line.
    * Every other row is emitted on its own.

    ``convert_as_inline`` is not used here.
    """
    # NOTE(review): the space after each newline in these literals may
    # be an extraction artifact -- confirm against the original file.
    cells = el.find_all(['td', 'th'])
    separator = '| ' + ' | '.join(['---'] * len(cells)) + ' |' + '\n '
    is_headrow = all(cell.name == 'th' for cell in cells)
    is_first_row = not el.previous_sibling

    overline = ''
    underline = ''
    if is_headrow and is_first_row:
        # first row and is headline: print headline underline
        underline = separator
    elif is_first_row:
        # Only look at the parent when it matters, so rows that never
        # reach this branch do not depend on ``el.parent``.
        parent = el.parent
        opens_table = (parent.name == 'table'
                       or (parent.name == 'tbody'
                           and not parent.previous_sibling))
        if opens_table:
            # first row, not headline, and no header precedes it:
            # print empty headline above this row
            empty_header = ('| ' + ' | '.join([''] * len(cells))
                            + ' |' + '\n ')
            overline = empty_header + separator
    return overline + '|' + text + '\n ' + underline
304+
def convert_th(self, el, text, convert_as_inline):
    """Format a header cell: pad ``text`` with spaces and close it with ``|``.

    ``el`` and ``convert_as_inline`` are not used here.
    """
    cell_open = ' '
    cell_close = ' |'
    return cell_open + text + cell_close
307+
def convert_td(self, el, text, convert_as_inline):
    """Format a data cell: pad ``text`` with spaces and close it with ``|``.

    ``el`` and ``convert_as_inline`` are not used here.
    """
    padded = ' ' + text
    return padded + ' |'
295310
def convert_hr(self, el, text, convert_as_inline):
    """Return the fixed Markdown for a horizontal rule.

    None of the arguments influence the result.
    """
    # NOTE(review): the space after each newline in this literal may be
    # an extraction artifact -- confirm against the original file.
    horizontal_rule = '\n \n ---\n \n '
    return horizontal_rule
0 commit comments