PythonTip >> 博文 >> python

Python 实现文件夹内网页的批量简繁转换

zihua 2014-04-03 15:04:36 点击: 920 | 收藏


1、从 GitHub 下载简繁转换的开源包 nstools:https://github.com/skydark/nstools/
2、将其中的 langconv.py 和 zh_wiki.py 添加到项目中
3、以下是源码(Python 2 语法):

#!/usr/bin/env python
# coding=utf-8
# __author__ = 'zhoujie'

import os, shutil, re, time, urlparse
from copy import deepcopy
from langconv import *


# Source folder
SCANDIR = r"D:\1\11"
# Output folder
OUTDIR = r"D:\1\22"

# Walk a folder, convert its HTML pages, and move every file to OUTDIR.
def scanfile(scanpath):
    """Process every file found under ``scanpath``.

    Each file path is printed, HTML pages (``.htm`` / ``.html`` extension,
    compared case-insensitively) are converted in place by ``handlehtml``,
    and then the file is copied to the same relative location under
    ``OUTDIR`` and removed from the source tree.

    Args:
        scanpath: root directory that is walked recursively.
    """
    for root, _dirs, names in os.walk(scanpath):
        # Mirror the layout relative to the folder actually being scanned.
        # Building the target with path functions (instead of string
        # replacement) keeps folder names intact even when they contain
        # the file name or the source prefix as a substring.
        todir = os.path.normpath(
            os.path.join(OUTDIR, os.path.relpath(root, scanpath)))
        for name in names:
            print("------------------------")
            sourfile = os.path.join(root, name)
            print(sourfile)
            # Match on the real extension so both .htm and .html qualify.
            extension = os.path.splitext(name)[1].lower()
            if extension in (".htm", ".html"):
                handlehtml(sourfile)
            if not os.path.isdir(todir):
                os.makedirs(todir)
            tofile = os.path.join(todir, name)
            # Copy first, delete afterwards: the source is only removed
            # once the copy has succeeded.
            shutil.copy(sourfile, tofile)
            os.remove(sourfile)


# Convert one HTML file to Traditional Chinese, in place.
def handlehtml(sourcefile):
    """Rewrite ``sourcefile`` with its text run through ``Converter('zh-hant')``.

    The file content is decoded as UTF-8, converted, re-encoded as UTF-8
    and written back over the original file.

    Args:
        sourcefile: path of the file to convert in place.

    Raises:
        UnicodeDecodeError: if the file content is not valid UTF-8. The
            file has not been modified at that point.
    """
    # Binary mode: the bytes are decoded and encoded explicitly below, so
    # no text-mode newline handling should alter them on the way in or out.
    with open(sourcefile, 'rb') as source:
        raw = source.read()

    # Convert to Traditional Chinese.
    converted = Converter('zh-hant').convert(raw.decode('utf-8'))

    with open(sourcefile, 'wb') as target:
        target.write(converted.encode('utf-8'))
    # TODO: handle URL replacement


if __name__ == "__main__":
    # Script entry point: loop forever, pausing five seconds per round.
    # NOTE(review): the scan call below is commented out, so each round
    # currently only sleeps and prints -- confirm whether it should be
    # re-enabled.
    while True:
        # scanfile(SCANDIR)
        time.sleep(5)
        print('do loop done and sleep 5')
原文链接:http://www.tuicool.com/articles/yuEjUv

作者:zihua | 分类: python | 标签: python | 阅读: 920 | 发布于: 2014-04-03 15时 |