This repository was archived by the owner on May 6, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 0
Expand file tree
/
Copy pathwxr-split.py
More file actions
executable file
·70 lines (54 loc) · 2.36 KB
/
wxr-split.py
File metadata and controls
executable file
·70 lines (54 loc) · 2.36 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
#!/usr/bin/python
# based on https://wordpress.org/support/topic/wxr-file-splitter/#post-1396182
# This script takes a WordPress XML export (WXR) file and splits it into some
# number of chunks (2 by default). The size of each chunk is determined by counting
# the lines that contain a particular marker, '<item>' by default, and breaking up the
# file such that each chunk holds at most the same number of occurrences of that marker.
# The appropriate header and footer are added to each chunk.
import os
import sys
import math
# Marker whose presence on a line signals the start of one exported item.
LINE_DELIMITER = '<item>'
# Closing tags appended to every chunk except the last one; the last chunk
# ends with whatever closing lines the input file itself contains.
FOOTER = "\n</channel>\n</rss>\n"
# Directory (relative to the current working directory) receiving the chunks.
OUTPUT_DIR = 'json-result'


def parse_chunk_count(argv):
    """Return the requested number of chunks from the command line.

    argv[2], when present, is parsed as an integer and clamped to a minimum
    of 2; without it the default of 2 is used.

    Raises:
        ValueError: if argv[2] is present but is not an integer.
    """
    if len(argv) > 2:
        return max(int(argv[2]), 2)
    return 2


def count_items(lines, line_delimiter=LINE_DELIMITER):
    """Return how many of *lines* contain *line_delimiter*."""
    return sum(1 for line in lines if line_delimiter in line)


def items_per_chunk(item_count, number_of_chunks):
    """Return the maximum number of items per chunk (ceiling division)."""
    return int(math.ceil(float(item_count) / number_of_chunks))


def iter_chunks(lines, max_items, line_delimiter=LINE_DELIMITER, footer=FOOTER):
    """Yield each output chunk as a list of strings ready to be written.

    Every line seen before the first delimiter line is remembered as the
    header. Each time a delimiter line would push the current chunk past
    *max_items*, the chunk is finished with *footer* and a new chunk is
    started with a copy of the header. At least one chunk is always yielded.

    main() only passes max_items == 0 when the input holds no items at all,
    in which case a single chunk containing every line is produced.
    """
    header_parts = []
    current = []
    items_in_chunk = 0
    in_first_chunk = True
    for line in lines:
        if line_delimiter in line:
            items_in_chunk += 1
        # Plain equality/boolean tests: the original compared ints with
        # `is`, which only works by accident of small-int caching.
        if in_first_chunk and items_in_chunk == 0:
            header_parts.append(line)
        if items_in_chunk > max_items:
            current.append(footer)
            yield current
            in_first_chunk = False
            # The line that triggered the rollover is the first item of
            # the new chunk, hence the count restarts at 1.
            items_in_chunk = 1
            current = list(header_parts)
        current.append(line)
    yield current


def main(argv=None):
    """Split the WordPress export named in argv[1] into numbered chunk files.

    Chunks are written to OUTPUT_DIR as <name>_<n><ext>. Progress is printed
    to stdout. Usage and input errors print a message and exit with status 1.
    Files are opened in the platform's default text mode.
    """
    if argv is None:
        argv = sys.argv

    if len(argv) < 2:
        print('Please specify the name of wordpress export file you would like to split')
        sys.exit(1)

    input_path = argv[1]
    try:
        # `with` guarantees the input handle is released once it is read.
        with open(input_path, 'r') as input_file:
            lines = input_file.readlines()
    except IOError:
        print('Could not open file "%s".' % input_path)
        sys.exit(1)

    input_file_string = os.path.basename(input_path)
    (input_file_name, input_file_extension) = os.path.splitext(input_file_string)

    try:
        number_of_chunks = parse_chunk_count(argv)
    except ValueError:
        print('Number of chunks must be an integer, got "%s".' % argv[2])
        sys.exit(1)

    item_count = count_items(lines)
    print('')
    print('File "%s" contains %s items' % (input_file_string, item_count))

    max_items = items_per_chunk(item_count, number_of_chunks)
    print('Creating %s files with at most %s items each:' % (number_of_chunks, max_items))

    # Create the output directory on demand instead of failing on open().
    if not os.path.isdir(OUTPUT_DIR):
        os.makedirs(OUTPUT_DIR)

    for chunk_number, parts in enumerate(iter_chunks(lines, max_items), 1):
        output_file_name = "%s_%s%s" % (input_file_name, chunk_number, input_file_extension)
        with open(os.path.join(OUTPUT_DIR, output_file_name), 'w') as output_file:
            print(' Writing chunk %s to file %s...' % (chunk_number, output_file_name))
            output_file.writelines(parts)

    print('Done!\n')


if __name__ == '__main__':
    main()