# Source code for confluencer.tools.content

# -*- coding: utf-8 -*-
# pylint: disable=bad-continuation
""" Tools to discover and modify content.
"""
# Copyright ©  2015 1&1 Group <git@1and1.com>
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import, unicode_literals, print_function

import re
import difflib
try:
    import html.entities as htmlentitydefs
except ImportError:  # Python 2
    import htmlentitydefs  # pylint: disable=import-error,wrong-import-order
from xml.sax.saxutils import quoteattr  # pylint: disable=wrong-import-order

import arrow
from munch import munchify as bunchify
from lxml.etree import fromstring, HTMLParser, XMLParser, XMLSyntaxError  # pylint: disable=no-name-in-module
from rudiments.reamed import click

from .._compat import BytesIO


# Lookup table: content format name as given on the CLI => name used by the Confluence API
CLI_CONTENT_FORMATS = {
    'view': 'view',
    'editor': 'editor',
    'storage': 'storage',
    'export': 'export_view',
    'anon': 'anonymous_export_view',
}

# Simple replacement rules, order is important!
#
# Each entry is a ``(name, compiled pattern, replacement)`` triple. The rules are
# materialized into a tuple on purpose: a bare generator expression would be
# exhausted after the first pass, so every later pass over the rules would
# silently apply nothing.
TIDY_REGEX_RULES = tuple((_name, re.compile(_rule), _subst) for _name, _rule, _subst in [
    ("FosWiki: Remove CSS class from section title",
     r'<(h[1-5]) class="[^"]*">', r'<\1>'),
    ("FosWiki: Remove static section numbering",
     r'(?<=<h.>)(<a name="[^"]+?"></a>|)[0-9.]+?\s*(?=<span class="tok">&nbsp;</span>)', r'\1'),
    ("FosWiki: Empty anchor in headers",
     r'(?<=<h.>)<a></a>\s* +', ''),
    ("FosWiki: 'tok' spans in front of headers",
     r'(?<=<h.>)(<a name="[^"]+?"></a>|)\s*<span class="tok">&nbsp;</span>', r'\1'),
    ("FosWiki: Section edit icons at the end of headers",
     r'\s*<a(?: class="[^"]*")? href="[^"]+"(?: title="[^"]*")?>'
     r'<ac:image [^>]+><ri:url ri:value="[^"]+/EditChapterPlugin/pencil.png" ?/>'
     r'</ac:image></a>(?=</span></h)', ''),
    ("FosWiki: 'Edit Chapter Plugin' spans (old)",
     r'(?<=<h.>)(<a name="[^"]+?"></a>|)\s*<span class="ecpHeading">'
     r'\s*([^<]+)(?:<br\s*/>)</span>\s*(?=</h.>)', r'\1\2'),
    ("FosWiki: 'Edit Chapter Plugin' spans (new)",
     r'(?<=<h.>)(<a name="[^"]+?"></a>|)\s*<span class="ecpHeading">'
     r'\s*([^<]+)(?:<br\s*/>)<a class="ecpEdit".+?</a></span>\s*(?=</h.>)', r'\1\2'),
    ("FosWiki: Residual leading whitespace in headers",
     r'(?<=<h.>)(<a name="[^"]+?"></a>|)\s* +', r'\1'),
    ("FosWiki: Replace TOC div with macro",
     r'(<a name="foswikiTOC" ?/>)?<div class="foswikiToc">.*?</div>', '''
          <ac:structured-macro ac:name="panel" ac:schema-version="1">
            <ac:parameter ac:name="title">Contents</ac:parameter>
            <ac:rich-text-body>
              <p>
                <ac:structured-macro ac:name="toc" ac:schema-version="1"/>
              </p>
            </ac:rich-text-body>
          </ac:structured-macro>'''),
    ("FosWiki: Replace TOC in a Twisty with Expand+TOC macro",
     r'<div class="twistyPlugin">.+?<big><strong>Table of Contents</strong></big></span></a></span></div>', '''
          <ac:structured-macro ac:name="expand" ac:schema-version="1">
            <ac:parameter ac:name="title">Table of Contents</ac:parameter>
            <ac:rich-text-body>
              <p>
                <ac:structured-macro ac:name="toc" ac:schema-version="1"/>
              </p>
            </ac:rich-text-body>
          </ac:structured-macro>'''),
    ("FosWiki: Named anchors (#WikiWords)",
     r'(<a name=[^>]+></a><a href=")http[^#]+(#[^"]+" style="[^"]+)(" title="[^"]+"><big>[^<]+</big></a>)',
     r'\1\2; float: right;\3'),
    ("FosWiki: Wrap HTML '<pre>' into 'panel' macro",
     r'(?<!<ac:rich-text-body>)(<pre(?: class="[^"]*")?>)',
     r'<ac:structured-macro ac:name="panel" ac:schema-version="1">'
     r'<ac:parameter ac:name="bgColor">#eeeeee</ac:parameter>'
     r'<ac:rich-text-body>'
     r'\1'),
    ("FosWiki: Wrap HTML '</pre>' into 'panel' macro",
     r'</pre>(?!</ac:rich-text-body>)', '</pre></ac:rich-text-body></ac:structured-macro>'),
    ("FosWiki: Embedded CSS - custom list indent",
     r'<ul style="margin-left: [.0-9]+em;">', '<ul>'),
    ("FosWiki: Empty paragraphs",
     r'<p>&nbsp;</p>', r''),
    ("FosWiki: Obsolete CSS classes",
     r'(<(?:div|p|span|h[1-5])) class="(foswikiTopic)"', r'\1'),
])


def _apply_tidy_regex_rules(body, log=None):
    """Run all ``TIDY_REGEX_RULES`` over ``body``, in order, and return the result.

    Non-breaking space characters are turned into ``&nbsp;`` entities first.
    When a truthy ``log`` is given, each rule that matched is reported via
    ``log.info`` together with the number of characters added or removed.

    A ``re.error`` raised during a substitution is re-raised as a
    ``click.LoggedFailure`` naming the offending rule.
    """
    tidied = body.replace(u'\u00A0', '&nbsp;')

    for name, rule, subst in TIDY_REGEX_RULES:
        size_before = len(tidied)
        try:
            tidied, hits = rule.subn(subst, tidied)
        except re.error as cause:
            raise click.LoggedFailure('Error "{}" in "{}" replacement: {} => {}'.format(
                cause, name, rule.pattern, subst,
            ))

        if not (hits and log):
            continue

        # Positive delta means the body shrank, negative that it grew
        delta = size_before - len(tidied)
        direction = "added" if delta < 0 else "removed"
        log.info('Replaced %d matche(s) of "%s" (%d chars %s)',
                 hits, name, abs(delta), direction)

    return tidied


def _make_etree(body, content_format='storage', attrs=None):
    """Create an ElementTree from a page's body.

    The body is wrapped into a synthetic root element named after
    ``content_format``, which carries ``attrs`` plus the ``ac`` / ``ri``
    namespace declarations (these always override same-named keys in
    ``attrs``). The caller's ``attrs`` mapping is not modified.

    Named character references other than the five XML built-ins are
    rewritten to numeric ones before parsing. Names missing from the
    entity table are left as they are, instead of failing with a bare
    ``KeyError``, so that handling them is up to the parser.

    The ``storage`` format is parsed with ``XMLParser``, anything else with
    ``HTMLParser``; blank text is removed in both cases.

    Raises ``click.LoggedFailure`` with a line-numbered dump of the document
    when the parser reports an ``XMLSyntaxError``.
    """
    attrs = (attrs or {}).copy()
    attrs.update({
        'xmlns:ac': 'http://www.atlassian.com/schema/confluence/4/ac/',
        'xmlns:ri': 'http://www.atlassian.com/schema/confluence/4/ri/',
    })

    def _numeric_cref(cref):
        """Return the numeric form of a named entity, or the match itself if the name is unknown."""
        codepoint = htmlentitydefs.name2codepoint.get(cref.group(2))
        if codepoint is None:
            return cref.group(0)
        return '&#{};'.format(codepoint)

    xml_body = re.sub(r'&(?!(amp|lt|gt|quot|apos))([a-zA-Z0-9]+);', _numeric_cref, body)
    xmldoc = u'<{root} {attrs}>{body}</{root}>'.format(
        root=content_format,
        # Sorted, so the generated root tag is stable for identical input
        attrs=' '.join('{}={}'.format(k, quoteattr(v)) for k, v in sorted(attrs.items())),
        body=xml_body)

    parser = (XMLParser if content_format == 'storage' else HTMLParser)(remove_blank_text=True)
    try:
        return fromstring(xmldoc, parser)
    except XMLSyntaxError as cause:
        raise click.LoggedFailure('{}\n{}'.format(
            cause, '\n'.join(['{:7d} {}'.format(i+1, k) for i, k in enumerate(xmldoc.splitlines())])
        ))


def _pretty_xml(body, content_format='storage', attrs=None):
    """Return the given page body as a list of pretty-printed lines.

    The body is parsed via ``_make_etree`` and serialized again as UTF-8,
    without an XML declaration.
    """
    tree = _make_etree(body, content_format=content_format, attrs=attrs).getroottree()

    buf = BytesIO()
    tree.write(buf, encoding='utf8', pretty_print=True, xml_declaration=False)

    text = buf.getvalue().decode('utf8')
    return text.splitlines()


class ConfluencePage(object):
    """A page that holds enough state so it can be modified.

    The page is fetched once on construction. ``body`` holds the markup of the
    requested representation, the complete response is available as ``json``.
    """

    # Terminal colors for diff output, keyed by the first character of a diff line
    DIFF_COLS = {
        '+': 'green',
        '-': 'red',
        '@': 'yellow',
    }

    def __init__(self, cf, url, markup='storage', expand=None):
        """Load the given page.

        Parameters:
            cf: API client; ``cf.get(url, expand=...)`` is called here, and
                ``cf.session.put(...)`` by ``update()``.
            url: URL the page is fetched from.
            markup: Name of the body representation to load.
            expand: Additional properties to expand, either as a
                comma-separated string or as an iterable of names.
                ``space``, ``version`` and the body are always expanded.
        """
        # Check against both text types, so that a unicode string is also
        # split on Python 2 (where ``str`` only covers byte strings)
        if expand and isinstance(expand, (str, type(u''))):
            expand = expand.split(',')
        expand = set(expand or []) | {'space', 'version', 'body.' + markup}

        self.cf = cf
        self.url = url
        self.markup = markup
        self._data = cf.get(self.url, expand=','.join(expand))
        self.body = self._data.body[self.markup].value

    @property
    def page_id(self):
        """The numeric page ID."""
        return self._data.id

    @property
    def space_key(self):
        """The space this page belongs to."""
        return self._data.space.key

    @property
    def title(self):
        """The page's title."""
        return self._data.title

    @property
    def json(self):
        """The full JSON response data."""
        return self._data

    @property
    def version(self):
        """The page's version number in history."""
        return self._data.version.number

    def etree(self):
        """Parse the page's body into an ElementTree.

        ID, link, status and title of the page are passed on as attributes
        for the root element.
        """
        attrs = {
            'id': 'page-' + self._data.id,
            'href': self._data._links.base + (self._data._links.tinyui or ''),
            'status': self._data.status,
            'title': self._data.title,
        }
        return _make_etree(self.body, content_format=self.markup, attrs=attrs)

    def tidy(self, log=None):
        """Return a tidy copy of this page's body.

        Only works for pages loaded in ``storage`` format. The page itself
        is not changed, pass the result to ``update()`` to store it.
        """
        assert self.markup == 'storage', "Can only clean up pages in storage format!"
        return _apply_tidy_regex_rules(self.body, log=log)

    def update(self, body=None, minor=True):
        """Update a page's content.

        Parameters:
            body: New markup; defaults to the current value of ``self.body``.
            minor: Whether the new version is flagged as a minor edit.

        Returns ``None`` without contacting the server if ``body`` equals the
        content that was last loaded or stored, else the bunchified JSON
        response of the PUT request.

        Errors reported by ``response.raise_for_status()`` are passed on.
        """
        assert self.markup == 'storage', "Cannot update non-storage page markup!"
        if body is None:
            body = self.body
        if body == self._data.body[self.markup].value:
            return  # No changes

        data = {
            #'id': self._data.id,
            'type': 'page',
            'space': {'key': self.space_key},
            'title': self.title,
            'version': dict(number=self.version + 1, minorEdit=minor),
            'body': {
                'storage': {
                    'value': body,
                    'representation': self.markup,
                }
            },
            'expand': 'version',
        }
        response = self.cf.session.put(self._data._links.self, json=data)
        response.raise_for_status()
        result = bunchify(response.json())

        # Remember what is stored now, so repeated calls are detected as "no changes".
        # NOTE(review): ``self.body`` is not refreshed here, so after an explicit
        # ``body`` was passed it still holds the old markup -- confirm this is intended.
        self._data.body[self.markup].value = body
        self._data.version = result.version
        return result

    def dump_diff(self, changed):
        """Dump a diff to terminal between changed and stored body.

        Both bodies are pretty-printed first, so the unified diff is line-based.
        """
        if self.body == changed:
            click.secho('=== No changes to "{0}"'.format(self.title), fg='green')
            return

        diff = difflib.unified_diff(
            _pretty_xml(self.body, self.markup),
            _pretty_xml(changed, self.markup),
            u'v. {0} of "{1}"'.format(self.version, self.title),
            u'v. {0} of "{1}"'.format(self.version + 1, self.title),
            arrow.get(self._data.version.when).replace(microsecond=0).isoformat(sep=' '),
            arrow.now().replace(microsecond=0).isoformat(sep=' '),
            lineterm='', n=2)
        for line in diff:
            # Color is picked by the line's first character; other lines stay uncolored
            click.secho(line, fg=self.DIFF_COLS.get(line and line[0], None))