IEWebArchive¶
A PyObjC Example without documentation
Sources¶
MHTDocument.py¶
import objc
from Cocoa import NSDocument
from loader import MHTLoader
class MHTDocument(NSDocument):
    """
    NSDocument subclass that shows an ``.mht`` archive in a web view.

    The archive is parsed with ``MHTLoader``, converted to a web archive and
    loaded into the main frame of ``webview``.
    """

    # Interface Builder outlets (presumably connected in the nib named by
    # windowNibName -- confirm against the nib file).
    locationbox = objc.IBOutlet()
    webview = objc.IBOutlet()

    # Path remembered by readFromFile_ofType_ when the web view is not
    # available yet; consumed by windowControllerDidLoadNib_.
    path = None
    statusText = None

    @objc.IBAction
    def navigateHistory_(self, sender):
        """
        Move through the web view history.

        Segment 0 of *sender* goes back, any other segment goes forward.
        """
        if sender.selectedSegment() == 0:
            self.webview.goBack_(sender)
        else:
            self.webview.goForward_(sender)

    def windowNibName(self):
        """Return the name of the nib used for the document window."""
        return "MHTDocument"

    def readFromFile_ofType_(self, path, tp):
        """
        Open the archive at *path*; always reports success.

        When the web view outlet is still ``None`` the path is stored and the
        actual load is deferred to windowControllerDidLoadNib_.
        """
        if self.webview is None:
            self.path = path
        else:
            self.readMHT_(path)

        return True

    def writeToFile_ofType_(self, path, tp):
        """Saving is not supported; always returns False."""
        # TODO: "save-as" functionality
        return False

    def windowControllerDidLoadNib_(self, controller):
        """Load the archive whose path was stored before the nib was loaded."""
        if self.path:
            self.readMHT_(self.path)

    def readMHT_(self, path):
        """
        Parse the MHT file at *path* and display it in the web view.

        The loader is kept in ``self.mht`` and the location box is set to the
        (fixed-up) URL of the archive's root part.
        """
        self.mht = MHTLoader(path)
        self.locationbox.setStringValue_(self.mht.fixupURL(self.mht.root))

        archive = self.mht.asWebArchive()

        # NOTE(review): the print and the dump to a fixed path in /tmp look
        # like debugging aids; kept to preserve existing behaviour.
        print("Archive", archive.description())
        with open("/tmp/archive.webarchive", "wb") as fp:
            fp.write(archive.data().bytes())

        main_frame = self.webview.mainFrame()
        main_frame.stopLoading()
        main_frame.loadArchive_(archive)
        # The stray ``1 / 0`` that used to follow raised ZeroDivisionError
        # after every successful load and has been removed.
loader.py¶
import email
from Cocoa import NSURL, NSData, NSString
from WebKit import WebArchive, WebResource
# def loadMHT(filename):
# """
# Load a .MHT HTML archive and return the WebArchive representation.
# """
# return MHTLoader(filename).asWebArchive()
class MHTLoader:
    """
    A loader for .mht files, an archive format used by MS Internet Explorer
    on Windows.

    An .mht file is a MIME message; every non-multipart part that carries a
    ``Content-Location`` header is stored in ``self.parts`` and the first
    such part becomes the root of the archive.
    """

    def __init__(self, filename):
        """Parse the MHT file at *filename*."""
        self.filename = filename

        # root of the archive (key into self.parts)
        self.root = None

        # Content-Location -> (content-type, data)
        self.parts = {}

        self.loadFile(filename)

    def loadFile(self, filename):
        """
        Read the MIME message in *filename* and fill ``self.parts``.

        ``self.root`` is set to the location of the first stored part when
        it has not been set yet.
        """
        # MHT archives may contain raw 8-bit data, so parse the file as
        # bytes instead of decoding it with the locale's text encoding.
        with open(filename, "rb") as fp:
            msg = email.message_from_binary_file(fp)

        for part in msg.walk():
            if part.get_content_maintype() == "multipart":
                continue

            location = part.get("Content-Location")
            if location is None:
                # A part without a location cannot be addressed as a
                # resource (and would break fixupURL later on).
                continue

            # Headers with undecodable bytes come back as Header objects
            # when parsing binary input; normalise to str.
            location = str(location)

            self.parts[location] = (
                part.get_content_type(),
                part.get_payload(decode=True),
            )

            if self.root is None:
                self.root = location

    def fixupURL(self, url):
        """Return *url* with backslashes replaced when it is a file: URL."""
        # IE creates MHT files with file: URLS containing backslashes,
        # NSURL insists that those are invalid, replace backslashes by
        # forward slashes.
        if url.startswith("file:"):
            return url.replace("\\", "/")
        else:
            return url

    def _makeResource(self, url, mimeType, data):
        """Wrap one archive part (bytes *data* at *url*) in a WebResource."""
        return WebResource.alloc().initWithData_URL_MIMEType_textEncodingName_frameName_(  # noqa: B950
            NSData.dataWithBytes_length_(data, len(data)),
            NSURL.URLWithString_(self.fixupURL(url)),
            NSString.stringWithString_(mimeType),
            None,
            None,
        )

    def asWebArchive(self):
        """
        Convert the MHT archive to a webarchive.

        The root part becomes the main resource, every other part becomes a
        subresource. Raises KeyError when the archive has no usable parts.
        """
        rootType, rootText = self.parts[self.root]

        # Backslashes in the root document are replaced by forward slashes,
        # matching what fixupURL does to the resource URLs.
        pageResource = self._makeResource(
            self.root, rootType, rootText.replace(b"\\", b"/")
        )

        resources = [
            self._makeResource(url, tp, data)
            for url, (tp, data) in self.parts.items()
            if url != self.root
        ]

        return WebArchive.alloc().initWithMainResource_subresources_subframeArchives_(
            pageResource, resources, None
        )
def main():
    """Convert the sample MHT file to a webarchive file (manual test)."""
    # Testing...
    loader = MHTLoader("python-home.mht")
    archive = loader.asWebArchive()

    with open("python-home.webarchive", "wb") as output:
        output.write(archive.data().bytes())


if __name__ == "__main__":
    main()
main.py¶
import MHTDocument # noqa: F401
import objc
from PyObjCTools import AppHelper
# Presumably switches on verbose diagnostics in the PyObjC bridge --
# confirm against the PyObjC documentation.
objc.setVerbose(1)

# Hand control to the event loop provided by PyObjCTools.AppHelper.
AppHelper.runEventLoop()
setup.py¶
"""
Script for building the example.
Usage:
python3 setup.py py2app
"""
from setuptools import setup
# The single document type of the application: ".mht" files, handled by
# the MHTDocument class.
mht_document_type = {
    "CFBundleTypeExtensions": ["mht"],
    "CFBundleTypeName": "Internet Explorer Web Archive",
    "CFBundleTypeRole": "Editor",
    "NSDocumentClass": "MHTDocument",
}

# Extra Info.plist entries passed to py2app.
plist = {"CFBundleDocumentTypes": [mht_document_type]}

py2app_options = {"plist": plist}

setup(
    name="MHTViewer",
    app=["main.py"],
    data_files=["MainMenu.nib", "MHTDocument.nib"],
    options={"py2app": py2app_options},
    setup_requires=["py2app", "pyobjc-framework-Cocoa", "pyobjc-framework-WebKit"],
)