如何将CSV / TSV数据导入CouchDB?
如何将CSV / TSV数据导入CouchDB?
使用python非常容易。
#!/usr/bin/env python
from couchdbkit import Server, Database
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
import sys, subprocess, math, os
def parseDoc(doc):
    """Convert string values of ``doc`` that look like numbers, in place.

    Each ``str`` value is tried as an ``int`` first and then as a ``float``.
    Values that are not strings, strings that do not parse as a number, and
    strings that parse to NaN or infinity are left untouched.

    Args:
        doc: a dict (for example a row produced by ``csv.DictReader``).

    Returns:
        The same ``doc`` object, after the in-place conversion.
    """
    # Snapshot the items so values can be reassigned while looping.
    for key, value in list(doc.items()):
        if not isinstance(value, str):
            continue
        # Try int() directly instead of testing isdigit(): isdigit() is true
        # for characters int() rejects (e.g. superscript digits) and false
        # for signed integers such as "-5".
        try:
            doc[key] = int(value)
            continue
        except ValueError:
            pass
        try:
            number = float(value)
        except ValueError:
            # Not numeric at all: keep the original string.
            continue
        # NaN and infinity have no JSON representation, so keep the text.
        if not (math.isnan(number) or math.isinf(number)):
            doc[key] = number
    return doc
def upload(db, docs):
    """Save ``docs`` with one ``db.bulk_save`` call and return a new empty list.

    The returned list lets the caller rebind its batch variable and start
    collecting the next batch. The list passed in is not modified here.
    """
    db.bulk_save(docs)
    return []
def uploadFile(fname, uri, dbname):
    """Upload every row of the CSV file ``fname`` to a CouchDB database.

    The first line of the file must hold the keys; each later line holds the
    values for those keys and becomes one document. Rows are converted with
    ``parseDoc`` and sent in batches through ``upload``.

    Args:
        fname: path of the CSV file to read.
        uri: server URI passed to ``Server``.
        dbname: database name; the database is created if it does not exist.
    """
    print('Upload contents of %s to %s/%s' % (fname, uri, dbname))

    # Connect to the database.
    theServer = Server(uri)
    db = theServer.get_or_create_db(dbname)

    checkpoint = 100  # number of documents per bulk upload
    docs = []

    # 'rU' gives universal newlines on Python 2. It was removed in Python
    # 3.11; there, open with mode 'r' and newline='' instead.
    # The with-block guarantees the file is closed even if an upload fails.
    with open(fname, 'rU') as csvfile:
        # See the csv module for other dialects, e.g. a tab delimiter.
        reader = DictReader(csvfile, dialect='excel')
        for row in reader:
            # Remember the id as text: parseDoc would turn a digit-only id
            # into a number, and CouchDB document ids must be strings.
            raw_id = row.get('_id')
            newdoc = parseDoc(row)
            if raw_id:
                newdoc['_id'] = raw_id
                # If the document already exists, attach its current _rev so
                # the bulk save updates it instead of conflicting.
                if db.doc_exist(raw_id):
                    newdoc['_rev'] = db.get_rev(raw_id)
            docs.append(newdoc)
            if len(docs) >= checkpoint:
                docs = upload(db, docs)

    # Don't forget the last, partially filled batch (skipped when empty).
    if docs:
        upload(db, docs)
if __name__ == '__main__':
    # Command line: <csv file> <server uri> <database name>.
    # Fail with a usage message instead of a bare IndexError when arguments
    # are missing; extra arguments are still ignored, as before.
    if len(sys.argv) < 4:
        sys.exit('usage: %s <csv-file> <couchdb-uri> <dbname>' % sys.argv[0])
    filename = sys.argv[1]
    uri = sys.argv[2]
    dbname = sys.argv[3]
    uploadFile(filename, uri, dbname)
使用python非常容易。
#!/usr/bin/env python
from couchdbkit import Server, Database
from couchdbkit.loaders import FileSystemDocsLoader
from csv import DictReader
import sys, subprocess, math, os
def parseDoc(doc):
    """Convert string values of ``doc`` that look like numbers, in place.

    Each ``str`` value is tried as an ``int`` first and then as a ``float``.
    Values that are not strings, strings that do not parse as a number, and
    strings that parse to NaN or infinity are left untouched.

    Args:
        doc: a dict (for example a row produced by ``csv.DictReader``).

    Returns:
        The same ``doc`` object, after the in-place conversion.
    """
    # Snapshot the items so values can be reassigned while looping.
    for key, value in list(doc.items()):
        if not isinstance(value, str):
            continue
        # Try int() directly instead of testing isdigit(): isdigit() is true
        # for characters int() rejects (e.g. superscript digits) and false
        # for signed integers such as "-5".
        try:
            doc[key] = int(value)
            continue
        except ValueError:
            pass
        try:
            number = float(value)
        except ValueError:
            # Not numeric at all: keep the original string.
            continue
        # NaN and infinity have no JSON representation, so keep the text.
        if not (math.isnan(number) or math.isinf(number)):
            doc[key] = number
    return doc
def upload(db, docs):
    """Save ``docs`` with one ``db.bulk_save`` call and return a new empty list.

    The returned list lets the caller rebind its batch variable and start
    collecting the next batch. The list passed in is not modified here.
    """
    db.bulk_save(docs)
    return []
def uploadFile(fname, uri, dbname):
    """Upload every row of the CSV file ``fname`` to a CouchDB database.

    The first line of the file must hold the keys; each later line holds the
    values for those keys and becomes one document. Rows are converted with
    ``parseDoc`` and sent in batches through ``upload``.

    Args:
        fname: path of the CSV file to read.
        uri: server URI passed to ``Server``.
        dbname: database name; the database is created if it does not exist.
    """
    print('Upload contents of %s to %s/%s' % (fname, uri, dbname))

    # Connect to the database.
    theServer = Server(uri)
    db = theServer.get_or_create_db(dbname)

    checkpoint = 100  # number of documents per bulk upload
    docs = []

    # 'rU' gives universal newlines on Python 2. It was removed in Python
    # 3.11; there, open with mode 'r' and newline='' instead.
    # The with-block guarantees the file is closed even if an upload fails.
    with open(fname, 'rU') as csvfile:
        # See the csv module for other dialects, e.g. a tab delimiter.
        reader = DictReader(csvfile, dialect='excel')
        for row in reader:
            # Remember the id as text: parseDoc would turn a digit-only id
            # into a number, and CouchDB document ids must be strings.
            raw_id = row.get('_id')
            newdoc = parseDoc(row)
            if raw_id:
                newdoc['_id'] = raw_id
                # If the document already exists, attach its current _rev so
                # the bulk save updates it instead of conflicting.
                if db.doc_exist(raw_id):
                    newdoc['_rev'] = db.get_rev(raw_id)
            docs.append(newdoc)
            if len(docs) >= checkpoint:
                docs = upload(db, docs)

    # Don't forget the last, partially filled batch (skipped when empty).
    if docs:
        upload(db, docs)
if __name__ == '__main__':
    # Command line: <csv file> <server uri> <database name>.
    # Fail with a usage message instead of a bare IndexError when arguments
    # are missing; extra arguments are still ignored, as before.
    if len(sys.argv) < 4:
        sys.exit('usage: %s <csv-file> <couchdb-uri> <dbname>' % sys.argv[0])
    filename = sys.argv[1]
    uri = sys.argv[2]
    dbname = sys.argv[3]
    uploadFile(filename, uri, dbname)
Apache CouchDB仅存储JSON文档。因此,要导入CSV,您必须先把每一行转换为一个独立的JSON文档,然后像平常一样通过POST提交它们。
您可能需要编写一个程序来遍历每一行:将每个CSV行(一组有序的值)转换为JSON文档(一组键值对),然后通过HTTP将其发送到CouchDB。
我用的是这个工具: https://github.com/glynnbird/couchimport 。如果您的CSV相当简单,那么只需设置数据库名称,再把CSV通过管道传给couchimport即可。
刚刚用Ruby写了一个脚本: csv2couchdb