IEWebArchive

A PyObjC Example without documentation

Sources

MHTDocument.py

import objc
from Cocoa import NSDocument
from loader import MHTLoader


class MHTDocument(NSDocument):
    """
    Document class that shows the contents of an .mht archive in a web view.

    The archive is parsed by ``MHTLoader``, converted to a WebArchive and
    loaded into the main frame of ``webview``.
    """

    # Outlets; presumably connected in the "MHTDocument" nib.
    locationbox = objc.IBOutlet()
    webview = objc.IBOutlet()

    # Path remembered by readFromFile_ofType_ when the web view outlet is
    # not set yet; consumed by windowControllerDidLoadNib_.
    path = None
    statusText = None

    @objc.IBAction
    def navigateHistory_(self, sender):
        """Go back when segment 0 of *sender* is selected, forward otherwise."""
        if sender.selectedSegment() == 0:
            self.webview.goBack_(sender)
        else:
            self.webview.goForward_(sender)

    def windowNibName(self):
        """Return the name of the nib that holds the document window."""
        return "MHTDocument"

    def readFromFile_ofType_(self, path, tp):
        """
        Load the archive at *path*, or remember it until the nib is loaded.

        Always returns True.
        """
        if self.webview is None:
            # No web view to load into yet: defer the actual loading to
            # windowControllerDidLoadNib_.
            self.path = path
        else:
            self.readMHT_(path)

        return True

    def writeToFile_ofType_(self, path, tp):
        """Saving is not implemented; always returns False."""
        # TODO: "save-as" functionality
        return False

    def windowControllerDidLoadNib_(self, controller):
        """Load the archive whose path was stored by readFromFile_ofType_."""
        if self.path:
            self.readMHT_(self.path)

    def readMHT_(self, path):
        """
        Parse the .mht file at *path* and display it in the web view.

        The location box is set to the (fixed up) URL of the archive's root
        resource.
        """
        self.mht = MHTLoader(path)
        self.locationbox.setStringValue_(self.mht.fixupURL(self.mht.root))
        archive = self.mht.asWebArchive()

        # NOTE(review): the print and the dump to a fixed /tmp path look like
        # debugging aids; kept because removing them would change the
        # observable side effects of opening a document.
        print("Archive", archive.description())
        with open("/tmp/archive.webarchive", "wb") as fp:
            fp.write(archive.data().bytes())

        self.webview.mainFrame().stopLoading()
        self.webview.mainFrame().loadArchive_(archive)

loader.py

import email

from Cocoa import NSURL, NSData, NSString
from WebKit import WebArchive, WebResource


# def loadMHT(filename):
#     """
#     Load a .MHT HTML archive and return the WebArchive representation.
#     """
#     return MHTLoader(filename).asWebArchive()


class MHTLoader:
    """
    A loader for .mht files, an archive format used by MS Internet Explorer
    on Windows.

    After construction ``parts`` maps the Content-Location of every
    non-multipart MIME part to a ``(content-type, data)`` tuple and ``root``
    is the Content-Location of the first such part.
    """

    def __init__(self, filename):
        """Parse the archive stored in the file named *filename*."""
        self.filename = filename

        # root of the archive (key into self.parts)
        self.root = None

        # Content-Location -> (content-type, data)
        self.parts = {}

        self.loadFile(filename)

    def loadFile(self, filename):
        """
        Read the MIME message in *filename* and record its parts.

        Multipart containers are skipped; every other part is stored in
        ``self.parts``. The first stored part becomes ``self.root`` when no
        root has been set yet.
        """
        # Parse the raw bytes: an .mht file can contain 8-bit data in an
        # arbitrary encoding, which a text-mode open() would either refuse
        # to decode or silently alter before the payload is turned back
        # into bytes.
        with open(filename, "rb") as fp:
            msg = email.message_from_binary_file(fp)

        for part in msg.walk():
            if part.get_content_maintype() == "multipart":
                continue

            # NOTE: a part without a Content-Location header is stored
            # under the key None.
            location = part.get("Content-Location")
            if location is not None:
                # A header holding non-ASCII bytes is returned as a Header
                # object by the binary parser; callers expect a string.
                location = str(location)
            contentType = part.get_content_type()
            data = part.get_payload(decode=True)

            self.parts[location] = (contentType, data)
            if self.root is None:
                self.root = location

    def fixupURL(self, url):
        """Return *url* with backslashes replaced in ``file:`` URLs."""
        # IE creates MHT files with file: URLS containing backslashes,
        # NSURL insists that those are invalid, replace backslashes by
        # forward slashes.
        if url.startswith("file:"):
            return url.replace("\\", "/")
        else:
            return url

    def asWebArchive(self):
        """
        Convert the MHT archive to a webarchive.

        The root part becomes the main resource, all other parts become
        subresources.
        """
        rootType, rootText = self.parts[self.root]
        # Backslashes in the root document are replaced by forward slashes
        # (a one-for-one byte substitution, so the length is unchanged).
        rootData = rootText.replace(b"\\", b"/")
        pageResource = WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(  # noqa: B950
            NSData.dataWithBytes_length_(rootData, len(rootData)),
            NSURL.URLWithString_(self.fixupURL(self.root)),
            NSString.stringWithString_(rootType),
            None,
            None,
        )

        resources = []
        for url, (tp, data) in self.parts.items():
            if url == self.root:
                continue

            resources.append(
                WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(
                    NSData.dataWithBytes_length_(data, len(data)),
                    NSURL.URLWithString_(self.fixupURL(url)),
                    NSString.stringWithString_(tp),
                    None,
                    None,
                )
            )

        return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
            pageResource, resources, None
        )


def main():
    """
    Manual test: convert ``python-home.mht`` to ``python-home.webarchive``.

    Both file names are relative to the current working directory.
    """
    # Testing...
    web_archive = MHTLoader("python-home.mht").asWebArchive()
    with open("python-home.webarchive", "wb") as out:
        out.write(web_archive.data().bytes())


if __name__ == "__main__":
    main()

main.py

import MHTDocument  # noqa: F401
import objc
from PyObjCTools import AppHelper

# Switch on PyObjC's verbose mode before the application starts.
objc.setVerbose(1)

# Start the event loop. NOTE(review): MHTDocument is imported above without
# being referenced, presumably so the document class named in the bundle's
# plist is defined before the application runs -- confirm.
AppHelper.runEventLoop()

setup.py

"""
Script for building the example.

Usage:
    python3 setup.py py2app
"""

from setuptools import setup

# Document type declared in the application's Info.plist: files with the
# "mht" extension are handled by the MHTDocument class.
mht_document_type = {
    "CFBundleTypeExtensions": ["mht"],
    "CFBundleTypeName": "Internet Explorer Web Archive",
    "CFBundleTypeRole": "Editor",
    "NSDocumentClass": "MHTDocument",
}

plist = {"CFBundleDocumentTypes": [mht_document_type]}

# Options handed to the py2app command.
py2app_options = {"plist": plist}

setup(
    name="MHTViewer",
    app=["main.py"],
    data_files=["MainMenu.nib", "MHTDocument.nib"],
    options={"py2app": py2app_options},
    setup_requires=["py2app", "pyobjc-framework-Cocoa", "pyobjc-framework-WebKit"],
)