This manual combines the official Thetis documentation with community knowledge.
"""
# Add processed page content with enhancements
for page_num, text in pdf_text.items():
html_content += f'
\n'
html_content += f'\n'
html_content += f'
\n'
# Check if this page appears to be a table of contents
if re.search(r'Table of Contents|Contents', text[:100]) or re.search(r'[0-9]+\.[0-9]+\s+.+\s+[0-9]+$', text, re.MULTILINE):
# This looks like a TOC or has section numbering with page numbers
html_content += f'
\n'
else:
# Process text with better paragraph handling
lines = text.split('\n')
current_paragraph = ""
in_list = False
for line in lines:
line = line.strip()
# Skip page headers/footers
if line.startswith("THETIS User Manual") or re.match(r'^Page \d+( of \d+)?$', line):
continue
# Check if this line looks like a section title
is_section = re.match(r'^[0-9]+(\.[0-9]+)* ', line)
# Check if line looks like part of a table/dotted line listing
is_table_line = re.search(r'\.{3,}|\s{5,}', line) and re.search(r'\d+$', line)
if not line: # Empty line - paragraph break
if current_paragraph:
html_content += f'
{html.escape(current_paragraph)}
\n'
current_paragraph = ""
elif is_section:
# This looks like a section heading
if current_paragraph:
html_content += f'
{html.escape(current_paragraph)}
\n'
current_paragraph = ""
html_content += f'
{html.escape(line)}
\n'
elif is_table_line:
# This is likely part of a TOC or table
if current_paragraph:
html_content += f'
{html.escape(current_paragraph)}
\n'
current_paragraph = ""
html_content += f'
{html.escape(line)}
\n'
else:
# Continue current paragraph
if current_paragraph:
current_paragraph += " " + line
else:
current_paragraph = line
# Don't forget the last paragraph
if current_paragraph:
html_content += f'
{html.escape(current_paragraph)}
\n'
# Add enhancements for this page if keywords match
for keyword, additions in community_enhancements.items():
if keyword.lower() in text.lower():
for title, content, source, url in additions:
html_content += f'''
'''
html_content += '
\n' # Close page-content
html_content += '