Jeswang's Blog

盲目跟随还是独立去做,To be or not to be?

下载不让直接下载的 SlideShare

| Comments

需求

最近看到的一篇比较实用的关于生产力的文章:[Quora][翻譯] 有哪些可以應用到每日生活的省時妙招呢?,内容是翻译 Quora 上的一个答案。我结合自己实践的结果,特别认同其中提到的一个观点:睡好、吃好、多运动。其他观点还需要好好实践一下才能体会出作者的用意。

文章中还提到了一个幻灯片:Productivity porn

内容很不错,但是作者不让下载。

slideshare_download

解决方案

但是这难不倒程序员,图片都有了,自己抓下来生成 PDF 就行了。搜了一下,果然有人实践过,代码如下(在 Mac / Linux 下使用时,需要注释掉生成 PPT 的相关代码):

(slideshare-dl.py) download
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
#!/usr/bin/env python
# -*- coding: utf-8 -*-

"""
slideshare-dl.py
~~~~~~~~~~~~~~~~

slideshare-dl is a small command-line program 
for downloading slides from SlideShare.net

"""

import os
import re
import subprocess
import sys
import urllib2
from xml.etree import ElementTree as ET

from BeautifulSoup import BeautifulSoup
from PIL import Image
from reportlab.lib.pagesizes import A4
from reportlab.pdfgen import canvas
import win32com.client #Only Windows for generating ppt

class SlideShare(object):
    """Download a SlideShare deck and rebuild it as a PDF (and PPTX on Windows).

    The workflow driven by ``get`` is: locate the deck's XML manifest,
    download every file the manifest lists, render each file to PNG with the
    external ``swfrender`` tool, then assemble the PNGs into a PDF and, on
    Windows, a PowerPoint presentation.
    """

    def __init__(self, url=None):
        self.url = url
        self.__xml_file = ''    # URL of the deck's XML manifest
        self.__slide_name = ''  # deck identifier; also names the output files
        self.__files = []       # downloaded slide files, in manifest order
        self.__images = []      # PNG files rendered from the downloads

    def set_xml_file(self):
        """Derive the deck name and manifest URL from the page at ``self.url``.

        Reads the ``"doc"`` value out of the page's ``page-json`` script tag
        and stores both the deck name and the matching manifest URL.

        Raises:
            ValueError: if the page carries no ``"doc"`` entry.
        """
        page = urllib2.urlopen(self.url)
        try:
            source = page.read()
        finally:
            page.close()

        soup = BeautifulSoup(source)
        html = soup.find("script", {"id": "page-json"})

        slide_regex = re.search('"doc":"(.*?)"', str(html), re.IGNORECASE)
        if slide_regex is None:
            # Fail with a clear message instead of an AttributeError on None.
            raise ValueError("Could not find the slide name in %s" % self.url)
        self.__slide_name = str(slide_regex.group(1))
        self.__xml_file = ("http://s3.amazonaws.com/slideshare/"
                           + self.__slide_name + ".xml")

    def create_directory(self, dir_name):
        """Create ``dir_name`` if needed and make it the working directory."""
        if not os.path.exists(dir_name):
            os.makedirs(dir_name)
        os.chdir(dir_name)

    def files_from(self, xml_file):
        """Return the ``Src`` attribute of every child of the manifest root.

        Args:
            xml_file: URL of the XML manifest.

        Returns:
            A list of strings, one per child element. The list is empty when
            the manifest cannot be fetched or parsed; the error is printed
            rather than raised so callers can always iterate the result.
        """
        try:
            manifest = urllib2.urlopen(xml_file)
            try:
                root = ET.parse(manifest).getroot()
            finally:
                manifest.close()
            return [str(subelement.get('Src')) for subelement in root]
        except Exception as inst:
            # Best-effort by design: report the failure and yield no files.
            print("Unexpected error opening xml file: %s" % inst)
            return []

    def download_file(self, url):
        """Download ``url`` into the current directory, showing progress.

        The local file name is the last path segment of ``url``. The name is
        recorded for ``convert_to_images`` only after the download completes.
        """
        file_name = url.split('/')[-1]
        u = urllib2.urlopen(url)
        try:
            # The server may omit Content-Length (e.g. chunked responses).
            lengths = u.info().getheaders("Content-Length")
            file_size = int(lengths[0]) if lengths else None
            size_label = "unknown" if file_size is None else file_size
            print("Downloading: %s Bytes: %s" % (file_name, size_label))

            file_size_dl = 0
            block_sz = 8192
            with open(file_name, 'wb') as f:
                while True:
                    chunk = u.read(block_sz)
                    if not chunk:
                        break

                    file_size_dl += len(chunk)
                    f.write(chunk)
                    if file_size:
                        status = "%10d  [%3.2f%%]" % (
                            file_size_dl, file_size_dl * 100. / file_size)
                    else:
                        status = "%10d" % file_size_dl
                    # Backspaces return the cursor to the start of the status
                    # so the next update overwrites it in place.
                    sys.stdout.write(status + chr(8) * len(status))
                    sys.stdout.flush()
        finally:
            u.close()
        # Leave the final status visible and start later output on a new line.
        sys.stdout.write("\n")
        self.__files.append(file_name)

    def download(self):
        """Download every file listed in the deck's manifest."""
        for url in self.files_from(self.__xml_file):
            self.download_file(url)

    def convert_to_images(self):
        """Render each downloaded file to a PNG with the ``swfrender`` tool.

        Only successfully rendered images are recorded, so later steps never
        reference a PNG that was not produced.
        """
        for filename in self.__files:
            source = os.path.join(os.getcwd(), filename)
            image_name = os.path.splitext(filename)[0] + '.png'
            # swfrender path/to/my.swf -o <output png>
            # An argument list (no shell) keeps remote-supplied file names
            # from being interpreted as shell syntax.
            try:
                status = subprocess.call(
                    ['swfrender', source, '-o', image_name])
            except OSError as inst:
                print("Could not run swfrender: %s" % inst)
                break
            if status != 0:
                print("swfrender failed for %s (exit code %s)"
                      % (filename, status))
                continue
            self.__images.append(image_name)

    def generate_pdf(self):
        """Write ``<slide name>.pdf`` with one landscape A4 page per image."""
        pdf_name = self.__slide_name + ".pdf"
        print("Generating PDF...")
        aux = canvas.Canvas(pdf_name, pagesize=A4)
        lWidth, lHeight = A4
        aux.setPageSize((lHeight, lWidth))  # swap the sides: landscape

        for filename in self.__images:
            image = os.path.join(os.getcwd(), filename)
            # NOTE(review): the image is drawn at its own size with a fixed
            # (60, 10) offset; large slides may overflow the page - confirm.
            aux.drawImage(image, 60, 10)
            aux.showPage()
        aux.save()
        print("Done.")

    def generate_ppt(self):
        """Write ``<slide name>.pptx`` by driving PowerPoint through COM.

        Windows only: requires ``win32com`` and an installed PowerPoint.
        """
        print("Generatin PPT...")
        ppLayoutBlank = 12  # PowerPoint's blank slide layout
        Application = win32com.client.Dispatch("PowerPoint.Application")
        try:
            Application.Visible = True
            Presentation = Application.Presentations.Add()

            # Each slide is inserted at position 1, so walking the images in
            # reverse leaves them in their original order.
            for filename in reversed(self.__images):
                pictName = os.path.join(os.getcwd(), filename)
                im = Image.open(pictName)
                width, height = im.size
                Slide1 = Presentation.Slides.Add(1, ppLayoutBlank)
                Slide1.Shapes.AddPicture(
                    FileName=pictName, LinkToFile=False,
                    SaveWithDocument=True, Left=0, Top=0,
                    Width=width, Height=height)

            Presentation.SaveAs(
                os.path.join(os.getcwd(), self.__slide_name + ".pptx"))
        finally:
            # Never leave a PowerPoint process behind, even on failure.
            Application.Quit()
        print("Done.")

    def get(self, url):
        """Download the deck at ``url`` and generate the output documents.

        Output is written to a directory named after the deck, which becomes
        the working directory. PPTX generation is attempted only on Windows.
        """
        self.url = url
        self.set_xml_file()
        self.create_directory(self.__slide_name)
        self.download()
        self.convert_to_images()
        if not self.__images:
            print("No slide images were produced; "
                  "skipping PDF/PPT generation.")
            return
        self.generate_pdf()
        if os.name == "nt":
            self.generate_ppt()

def main(argv=None):
    """Download one SlideShare deck, taking its URL from the command line.

    Args:
        argv: Command-line arguments without the program name; defaults to
            ``sys.argv[1:]``. The first argument, when present, is the deck
            URL. With no argument the built-in sample deck is downloaded.
    """
    if argv is None:
        argv = sys.argv[1:]
    default_url = "http://www.slideshare.net/david.motta/modelo-del-negocio-con-rup-y-uml-parte-3-1534304"
    url = argv[0] if argv else default_url
    slide = SlideShare()
    slide.get(url)
# Run the downloader only when this file is executed as a script.
if __name__ == "__main__":
  main()

代码出处:slideshare-dl is a small command-line program for downloading slides from SlideShare.net

PS: 据 Slideshare 评论里作者的回复,46 - 51 页码里的小本子是作者自己用 InDesign 做的,足见他是有多喜欢探索提高效率的方法。

- EOF -

Comments