Writing a Markdown to LaTeX Converter in Python

December 13, 2014 cjohnson318

In this post I’ll describe a simple (read: limited) Markdown to LaTeX converter. It covers up to three levels of headings, plus ordered and unordered lists.

Headings

The first function determines whether a line is a heading, based on the presence of symmetrically placed hashes. The second function replaces a heading line with the LaTeX sectioning command that corresponds to its heading level.

import re

def heading( line, nhashes ):
    '''
    Return True if line is a heading, otherwise False.

    A heading is a line that starts and ends with the same number of
    hashes and has some non-blank content in between, e.g. "## Title ##".

    line    -- the text to test
    nhashes -- total number of '#' characters counted in line; half of
               them are expected at each end of the line
    '''
    # the hashes must split evenly between the two ends of the line,
    # so an odd or zero count can never be a heading
    if nhashes <= 0 or nhashes % 2 != 0:
        return False
    # floor division keeps level an int on both Python 2 and Python 3
    level = nhashes // 2
    # triple braces: the outer pair is an escaped literal brace and the
    # inner {0} is the format field, producing the quantifier #{level};
    # \S requires at least one non-blank character between the hashes
    expr = r"^#{{{0}}}.*\S.*#{{{0}}}$".format( level )
    return re.match( expr, line ) is not None

def parse_heading( line, nhashes ):
    r'''
    Convert a Markdown heading line into a LaTeX sectioning command.

    line    -- the text to convert
    nhashes -- total number of '#' characters counted in line

    Returns '\section*{...}', '\subsection*{...}' or
    '\subsubsection*{...}' for heading levels 1, 2 and 3. The line is
    returned unchanged when it is not a heading, when the level is
    deeper than 3, or when there is no text between the hashes.
    '''
    # if not a heading, return line
    if not heading( line, nhashes ):
        return line
    # floor division keeps level an int, which '#'*level requires
    level = nhashes // 2
    commands = {
        1: r"\section*",
        2: r"\subsection*",
        3: r"\subsubsection*",
    }
    # only three heading levels are supported
    if level not in commands:
        return line
    # grab the heading content: the text left once the hash runs are removed
    pieces = [ piece.strip() for piece in line.split( '#'*level ) ]
    pieces = [ piece for piece in pieces if piece ]
    # nothing but hashes and blanks: leave the line alone rather than
    # failing on an empty list
    if not pieces:
        return line
    return commands[ level ] + "{" + pieces[0] + "}"

Lists

This part is the most convoluted. Here, we parse sections of text that are indented and begin with a numeral, or some marker. We only support one level of indentation.

def look_for_lists( x, marker, typelist ):
    r'''
    Backend for the ordered and unordered list functions.

    x        -- list of strings (the lines of the document); it is
                modified in place and also returned
    marker   -- regex fragment that matches the list marker, e.g. a bullet
    typelist -- name of the LaTeX environment, e.g. 'itemize'

    Every run of consecutive lines that are indented and start with the
    marker followed by whitespace is turned into a LaTeX list: the marker
    is replaced by '  \item ' and the run is wrapped in
    \begin{typelist} ... \end{typelist}. Only one level of indentation
    is recognised. If no such lines exist, x is returned untouched.
    '''
    # raw strings throughout so that \s reaches the regex engine intact
    pattern = re.compile( r'^\s+' + marker + r'\s' )
    # collect runs of consecutive indices whose lines carry an indented marker
    groups = []
    for i, line in enumerate( x ):
        if pattern.match( line ):
            if groups and groups[-1][-1] == i - 1:
                groups[-1].append( i )
            else:
                groups.append( [ i ] )
    # if no indented markers are found, return
    if not groups:
        return x
    # replace the markers with \item tags; a callable replacement is used
    # because a template string would have its backslash parsed as a
    # regex escape (and "\i" is rejected as a bad escape on Python 3.7+)
    for group in groups:
        for i in group:
            x[ i ] = pattern.sub( lambda match: r'  \item ', x[ i ] )
    # insert the delimiters starting from the last group, so that earlier
    # indices are not shifted by the insertions made after them
    for group in reversed( groups ):
        x.insert( group[-1] + 1, r'\end{' + typelist + '}' )
        x.insert( group[0], r'\begin{' + typelist + '}' )
    return x

def ordered_lists( x ):
    '''
    Use a regex to find numbered lists.

    x -- list of strings (the lines of the document)

    Passes x to look_for_lists() with a marker of one or more digits
    followed by a dot and the 'enumerate' environment, and returns
    its result.
    '''
    # raw string so the backslash-dot reaches the regex engine as an
    # escaped dot instead of being an invalid string escape
    return look_for_lists( x, r'[0-9]+\.', 'enumerate' )

def unordered_lists( x ):
    '''
    Use a regex to find unordered lists.

    x -- list of strings (the lines of the document)

    Passes x to look_for_lists() with a marker of a single '-', '*',
    '+' or 'o' character and the 'itemize' environment, and returns
    its result.
    '''
    # raw string so the escaped * and + reach the regex engine unchanged
    # instead of being invalid string escapes
    return look_for_lists( x, r'[-\*\+o]', 'itemize' )

The Rest

These functions wrap up the previous functions to perform the conversion operation.

def open_markdown( fn ):
    '''
    Open a file, and return a list of strings.

    fn -- path of the file to read

    Returns one string per line of the file, with trailing whitespace
    (including the line ending) stripped from each.
    '''
    # the with-block closes the file even if reading fails
    with open( fn, 'r' ) as fh:
        return [ line.rstrip() for line in fh ]

def parse_document( x ):
    '''
    Turn the lines of a Markdown document into the lines of a LaTeX one.

    x -- list of strings, one per line of Markdown

    Handles three levels of headings plus ordered and unordered lists,
    and returns the converted lines wrapped in a minimal article
    document.
    '''
    # headings are converted line by line, keyed on each line's hash count
    body = [ parse_heading( text, text.count('#') ) for text in x ]
    # numbered lists are handled first, then bulleted lists
    body = unordered_lists( ordered_lists( body ) )
    # surround the body with a simple header and footer
    header = [ r'\documentclass{article}', r'\begin{document}' ]
    footer = [ r'\end{document}' ]
    return header + body + footer
    
def write_latex( fn, y ):
    '''
    Write a list of strings to a LaTeX file.

    fn -- name of the Markdown source file; the output goes to the same
          name with a trailing '.md' replaced by '.tex' (or with '.tex'
          appended when fn does not end in '.md')
    y  -- list of strings, written one per line
    '''
    # strip only a trailing '.md'; splitting on '.md' would cut the name
    # at the first '.md' found anywhere in the path
    suffix = '.md'
    base = fn[ :-len( suffix ) ] if fn.endswith( suffix ) else fn
    # the with-block closes the file even if a write fails
    with open( base+'.tex', 'w' ) as fh:
        for line in y:
            fh.write( line+"\n" )

def convert( fn ):
    '''
    Convert the Markdown file fn to LaTeX.

    Reads fn with open_markdown(), translates the lines with
    parse_document(), and passes the result to write_latex() together
    with the same filename.
    '''
    markdown_lines = open_markdown( fn )
    write_latex( fn, parse_document( markdown_lines ) )

We can then convert a Markdown document to a LaTeX document by passing the filename of the Markdown file to the convert() function.

Blog about math, programming, and data.