网页截图实现关键词高亮

2018-02-27 09:01:51 | 来源:cnblogs.com | 作者:enrio | 人点击

分享

Web截图实现关键词高亮

代码

"""Capture a full-page screenshot of a URL, highlighting keywords on the page.

The page is opened in a headless PhantomJS browser, scrolled to the bottom
(to trigger lazy-loaded content), every occurrence of the given keywords is
wrapped in a highlighted <span>, and the result is written to ``img/``.
"""
from selenium import webdriver
import os
import time
import sys
import re
from PIL import Image
from io import BytesIO

DEBUG_MODE = True

# jQuery source that is injected into pages which do not already ship jQuery.
with open("jquery.min.js", encoding="utf-8") as jquery_file:
    JQUERY_SCRIPT = jquery_file.read()

# Scrolls the page down step by step, then appends "scroll-done" to the
# document title so the Python side can poll for completion.
_SCROLL_SCRIPT = """
        var y = 0;
        var step = 100;
        window.scroll(0, 0);
        function f() {
            if (y < document.body.scrollHeight) {
            y += step;
            window.scroll(0, y);
            setTimeout(f, 100);
            } else {
            window.scroll(0, 0);
            document.title += "scroll-done";
            }
        }
        setTimeout(f, 1000);
    """

# Builds `pattern` (an alternation of literal keywords) from arguments[0].
# Keywords are passed as script arguments and escaped in JS instead of being
# spliced into the script text, so quotes, slashes and regex metacharacters
# in a keyword cannot break the script or change the match.
_KEYWORD_PATTERN_JS = r"""
    var words = arguments[0];
    var parts = [];
    for (var i = 0; i < words.length; i++) {
        parts.push(words[i].replace(/[.*+?^${}()|[\]\\]/g, "\\$&"));
    }
    var pattern = parts.join("|");
"""

_HAS_HIT_SCRIPT = _KEYWORD_PATTERN_JS + r"""
    return new RegExp(pattern).test(jQuery('body').text());
"""

_HIGHLIGHT_SCRIPT = _KEYWORD_PATTERN_JS + r"""
    // A non-global regex for test() (a global one is stateful via lastIndex)
    // and a global one for replace(), so every occurrence is highlighted.
    var probe = new RegExp(pattern);
    var replacer = new RegExp(pattern, "g");
    var wrap = "<span style='color:yellow;font-size:1.5em;background-color:red;font-weight:bold'>$&</span>";
    jQuery('body').find('*').filter(function () {
        // Keep only elements whose OWN text (children removed) matches.
        var own = jQuery(this).clone();
        own.children().remove();
        return probe.test(own.text());
    }).each(function () {
        var node = jQuery(this);
        // Substitute only in text segments, never inside <...> tags, so
        // attributes that happen to contain a keyword are left intact.
        var html = node.html().replace(/<[^>]*>|[^<]+/g, function (piece) {
            return piece.charAt(0) === "<" ? piece : piece.replace(replacer, wrap);
        });
        node.html(html);
    });
"""


def log(msg):
    """Print *msg* when DEBUG_MODE is enabled."""
    if DEBUG_MODE:
        print(msg)


def _ensure_parent_dir(path):
    """Create the directory that will contain *path* if it is missing."""
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)


def _init_browser(log_path):
    """Start a PhantomJS browser with the window size and timeouts used here."""
    _ensure_parent_dir(log_path)
    # NOTE(review): webdriver.PhantomJS was removed in Selenium 4; this
    # script requires Selenium 3.x (or a port to headless Chrome/Firefox).
    browser = webdriver.PhantomJS(service_log_path=log_path)
    browser.set_window_size(1400, 900)
    browser.set_page_load_timeout(40)
    browser.set_script_timeout(40)
    return browser


def _scroll_through_page(browser):
    """Scroll to the bottom of the page and wait (up to ~30 s) until done."""
    browser.execute_script(_SCROLL_SCRIPT)
    for _ in range(30):
        if "scroll-done" in browser.title:
            break
        time.sleep(1)


def _highlight_keywords(browser, keywords):
    """Highlight *keywords* on the current page.

    Returns True when at least one keyword occurs in the page text (and has
    been highlighted), False otherwise.
    """
    has_jq = browser.execute_script("""return typeof(jQuery)!="undefined" """)
    if not has_jq:
        log("加载jquery脚本")
        browser.execute_script(JQUERY_SCRIPT)
    if not browser.execute_script(_HAS_HIT_SCRIPT, keywords):
        return False
    browser.execute_script(_HIGHLIGHT_SCRIPT, keywords)
    return True


def _save_screenshot(browser, path):
    """Flatten the browser screenshot onto white and save it to *path*."""
    png_data = browser.get_screenshot_as_png()
    # Force RGBA so an alpha band always exists to use as the paste mask.
    img = Image.open(BytesIO(png_data)).convert("RGBA")
    background = Image.new("RGB", img.size, (255, 255, 255))
    background.paste(img, mask=img.split()[3])
    _ensure_parent_dir(path)
    # NOTE(review): the data is always JPEG-encoded, even when the file name
    # ends in ".png" (as the default does). Kept as-is for compatibility.
    background.save(path, 'JPEG', quality=80)


def capture_with_highlight(url, save_fn="capture.png", keywords=None):
    """Screenshot *url* with *keywords* highlighted.

    Args:
        url: Address of the page to capture.
        save_fn: File name of the screenshot; it is written to ``img/<save_fn>``
            and the PhantomJS log to ``log/<save_fn>.log``.
        keywords: Iterable of literal keyword strings to highlight (a single
            string is treated as one keyword). When empty or None the page
            is captured without highlighting.

    Returns:
        True when the screenshot was saved; False when keywords were given
        but none of them occurs on the page (no screenshot is written).
    """
    if isinstance(keywords, str):
        keywords = [keywords]
    # Drop empty keywords: an empty alternative would match everywhere.
    keywords = [word for word in (keywords or []) if word]

    browser = _init_browser("log/" + save_fn + ".log")
    try:
        log("正在打开页面:"+url)
        browser.get(url)
        log("执行滚动脚本")
        _scroll_through_page(browser)
        log("滚动完成,判定关键词命中并添加高亮")
        if keywords and not _highlight_keywords(browser, keywords):
            log("未在页面上找到关键词")
            return False
        _save_screenshot(browser, "img/" + save_fn)
        return True
    finally:
        # Always shut the PhantomJS process down, including on errors and on
        # the success path (the original leaked it in both cases).
        browser.quit()


if __name__ == "__main__":
    capture_with_highlight(
        "https://item.jd.com/10124713723.html", keywords=["老字号"])

结果(部分截图)

流程图

最新文章

123

最新摄影

微信扫一扫

第七城市微信公众平台