Hi Rob,
Thanks for the class. I assume you created it in “example_watch_folder.py”, or whatever you may have renamed it to? Can you send me the full Python script if possible?
I modified example_watch_folder.py as follows (using your code):
if __name__ == '__main__':
    try:
        api_key = sys.argv[1]
        api_url = sys.argv[2]
        #in_folder = sys.argv[3]
        #out_folder = sys.argv[4]
        fullpaths = sys.argv[3]
        data_library = sys.argv[4]
        workflow = sys.argv[5]
    except IndexError:
        print 'usage: %s key url fullpaths data_library workflow' % os.path.basename(sys.argv[0])
        sys.exit(1)
    #main(api_key, api_url, in_folder, out_folder, data_library, workflow)
    main(api_key, api_url, fullpaths, data_library, workflow)
#def main(api_key, api_url, in_folder, out_folder, data_library, workflow):
def main(api_key, api_url, fullpaths, data_library, workflow):
    ...
    while 1:
        #Galaxy needs to read the pathnames as a new-line-delimited string,
        #so we do that transformation here
        print fullpaths
        fullpaths_string = ""
        for path in fullpaths:
            fullpaths_string = fullpaths_string + path + "\n"
        fullpaths_string = fullpaths_string[:-1]
        data = {}
        data['folder_id'] = library_folder_id
        data['file_type'] = 'auto'
        data['dbkey'] = ''
        data['upload_option'] = 'upload_paths'
        data['filesystem_paths'] = fullpaths_string
        data['create_type'] = 'file'
        print "before libset "
        #Start the upload. This will return right away, but it may take a while.
        libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
        print "after libset "
        #Iterate through each dataset we just uploaded and block until all files have been written to the Galaxy database
        for ds in libset:
            last_filesize = 0
            while True:
                #If file_size != 0 and the file_size is unchanged after a second check, we assume the disk write is finished
                ds_id = ds['id']
                uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds_id), return_formatted=False)
                print uploaded_file
                if uploaded_file['file_size'] != 0 and uploaded_file['file_size'] == last_filesize:
                    break
                else:
                    last_filesize = uploaded_file['file_size']
                    time.sleep(2)
However, when I run this I get the following output, i.e. there is a new line after each character. Shouldn't you use os.path.dirname?
./milxview_watch_folder.py de5f19fcf64a47ca9b61cfc3bf41490c
http://barium-rbh/csiro/api/ "/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz,/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz" "This One" f2db41e1fa331b3e
/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz,/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz
/
h
o
m
e
[... output continues one character per line, spelling out both full pathnames, twice ...]
g
z
before libset
after libset
Traceback (most recent call last):
  File "./milxview_watch_folder.py", line 127, in <module>
    main(api_key, api_url, fullpaths, data_library, workflow )
  File "./milxview_watch_folder.py", line 70, in main
    ds_id = ds['id']
TypeError: string indices must be integers, not str
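Could the problem be that sys.argv[3] arrives as one comma-separated string, so "for path in fullpaths" iterates over its characters (which would explain the one-character-per-line output), and libset then comes back as an error string instead of a list of dicts? If so, I guess the fix would be something like this sketch (assuming the paths themselves never contain commas):

# Sketch: split the comma-separated command-line argument into a list of paths.
# Assumes the paths themselves never contain commas.
fullpaths = [p.strip() for p in sys.argv[3].split(',')]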
From: Rob Leclerc [mailto:robert.leclerc@gmail.com]
Sent: Wednesday, 29 May 2013 11:38 PM
To: Burdett, Neil (ICT Centre, Herston - RBWH)
Cc: galaxy-dev@lists.bx.psu.edu; Dannon Baker
Subject: Re: Creating multiple datasets in a libset
Hi Neil,
I've attached my class function for uploading multiple files.
def upload_files(self, fullpaths):
    """
    Uploads files from a disk location to a Galaxy library.
    Accepts an array of full path filenames.
    Example: fullpaths = ['/home/username/file1.txt', '/home/username/file2.txt']
    """
    if self.jsonstring == None:
        self.get_library()
    library_id = self.library_id
    library_folder_id = self.library_folder_id
    api_key = self.api_key
    api_url = self.api_url
    #Galaxy needs to read the pathnames as a new-line-delimited string,
    #so we do that transformation here
    fullpaths_string = ""
    for path in fullpaths:
        fullpaths_string = fullpaths_string + path + "\n"
    fullpaths_string = fullpaths_string[:-1]
    data = {}
    data['folder_id'] = library_folder_id
    data['file_type'] = 'auto'
    data['dbkey'] = ''
    data['upload_option'] = 'upload_paths'
    data['filesystem_paths'] = fullpaths_string
    data['create_type'] = 'file'
    #Start the upload. This will return right away, but it may take a while.
    libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
    #Iterate through each dataset we just uploaded and block until all files have been written to the Galaxy database
    for ds in libset:
        last_filesize = 0
        while True:
            #If file_size != 0 and the file_size is unchanged after a second check, we assume the disk write is finished
            ds_id = ds['id']
            uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds_id), return_formatted=False)
            print uploaded_file
            if uploaded_file['file_size'] != 0 and uploaded_file['file_size'] == last_filesize:
                break
            else:
                last_filesize = uploaded_file['file_size']
                time.sleep(2)
    self.libset = libset
    return libset
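In case a usage example helps, I call it roughly like this (the class name and constructor here are placeholders for my own wrapper, which stores api_key, api_url, library_id and library_folder_id; substitute your own):

# Hypothetical wrapper class; only upload_files() above is real code.
lib = GalaxyLibrary(api_key='...', api_url='http://localhost:8080/api/', library_name='This One')
libset = lib.upload_files(['/home/username/file1.txt', '/home/username/file2.txt'])
for ds in libset:
    print ds['id']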
Rob Leclerc, PhD
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.leclerc@aya.yale.edu
On Wed, May 29, 2013 at 12:45 AM, <Neil.Burdett@csiro.au> wrote:
Hi Guys,
Did you manage to get multiple datasets working? I can't seem to upload multiple files; only the last file appears in the history. I changed my code in "example_watch_folder.py", as mentioned in the thread below, to add multiple files separated by a new line, and increased the sleep time:
for fname in os.listdir(in_folder):
    fullpath = os.path.join(in_folder, fname)
    print ' fullpath is [%s] ' % fullpath
    if os.path.isfile(fullpath):
        data = {}
        data['folder_id'] = library_folder_id
        data['file_type'] = 'auto'
        data['dbkey'] = ''
        data['upload_option'] = 'upload_paths'
        data['filesystem_paths'] = "/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz"
        print ' data is [%s] ' % str(data['filesystem_paths'])
        data['create_type'] = 'file'
        libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
        #TODO Handle this better, but the datatype isn't always
        # set for the followup workflow execution without this
        # pause.
        time.sleep(65)
However, I get the following crash:
./example_watch_folder.py 64f3209856a3cf4f2d034a1ad5bf851c
http://barium-rbh/csiro/api/ /home/galaxy/galaxy-drop/input /home/galaxy/galaxy-drop/output "This One" f2db41e1fa331b3e
fullpath is [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
data is [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz
/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
url is :
http://barium-rbh/csiro/api/libraries/33b43b4e7093c91f/contents?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'file_type': 'auto', 'dbkey': '', 'create_type': 'file', 'folder_id': 'F33b43b4e7093c91f', 'upload_option': 'upload_paths', 'filesystem_paths': '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'}
url is :
http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src': 'ld', 'id': 'ff5476bcf6c921fa'}}, 'history': '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
{'outputs': ['daecbdd824e1c349', '358eb58cd5463e0d', 'c0279aab05812500'], 'history': '3cc0effd29705aa3'}
url is :
http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src': 'ld', 'id': '79966582feb6c081'}}, 'history': '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
{'outputs': ['19c51286b777bc04', '0f71f1fc170d4ab9', '256444f6e7017e58'], 'history': 'b701da857886499b'}
Traceback (most recent call last):
  File "./example_watch_folder.py", line 89, in <module>
    main(api_key, api_url, in_folder, out_folder, data_library, workflow )
  File "./example_watch_folder.py", line 75, in main
    shutil.move(fullpath, os.path.join(out_folder, fname))
  File "/usr/lib/python2.7/shutil.py", line 299, in move
    copy2(src, real_dst)
  File "/usr/lib/python2.7/shutil.py", line 128, in copy2
    copyfile(src, dst)
  File "/usr/lib/python2.7/shutil.py", line 82, in copyfile
    with open(src, 'rb') as fsrc:
IOError: [Errno 2] No such file or directory: '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'
It says there is no such file, but this file has already been copied from the input to the output directory. Any help much appreciated.
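For reference, the end result I'm after is to gather every path in the watch folder first and then make a single upload request, along these lines (just a sketch of my intent, reusing the variables from the script above):

# Sketch: collect all files first, then upload them in a single request.
fullpaths = [os.path.join(in_folder, f) for f in os.listdir(in_folder)]
fullpaths = [p for p in fullpaths if os.path.isfile(p)]
data = {}
data['folder_id'] = library_folder_id
data['file_type'] = 'auto'
data['dbkey'] = ''
data['upload_option'] = 'upload_paths'
data['filesystem_paths'] = "\n".join(fullpaths)
data['create_type'] = 'file'
libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted=False)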
Neil
------------------------------
Date: Mon, 29 Apr 2013 16:11:39 -0400
From: Rob Leclerc <robert.leclerc@gmail.com>
To: Dannon Baker <dannon.baker@gmail.com>
Cc: "galaxy-dev@bx.psu.edu" <galaxy-dev@bx.psu.edu>
Subject: Re: [galaxy-dev] Creating multiple datasets in a libset
Hi Dannon,
I've written some code to (i) query a dataset to ensure that it's been
uploaded after a submit, and (ii) ensure a resulting dataset has been
written to the filesystem.
#Block until all datasets have been uploaded
libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
for ds in libset:
    while True:
        uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds['id']), return_formatted=False)
        if uploaded_file['misc_info'] == None:
            time.sleep(1)
        else:
            break

#Block until all result datasets have been saved to the filesystem
result_ds_url = api_url + 'histories/' + history_id + '/contents/' + dsh['id']
while True:
    result_ds = display(api_key, result_ds_url, return_formatted=False)
    if result_ds["state"] == 'ok':
        break
    else:
        time.sleep(1)
Rob Leclerc, PhD
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.leclerc@aya.yale.edu
On Mon, Apr 29, 2013 at 11:18 AM, Dannon Baker <dannon.baker@gmail.com> wrote:
> Yep, that example filesystem_paths you suggest should work fine. The
> sleep() bit was a complete hack from the start, for simplicity in
> demonstrating a very basic pipeline, but what you probably want to do for a
> real implementation is query the dataset in question via the API, verify
> that the datatype/etc have been set, and only after that execute the
> workflow, instead of relying on sleep.
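>
> For example, a polling loop along these lines (just a sketch; display() is
> the helper from the API example scripts, and the exact field to check may
> vary):
>
>     while True:
>         ds_info = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds['id']), return_formatted=False)
>         if ds_info.get('state') == 'ok':
>             break
>         time.sleep(1)
>     # datatype etc. should be set now; safe to submit the workflow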
>
>
> On Mon, Apr 29, 2013 at 9:24 AM, Rob Leclerc <robert.leclerc@gmail.com> wrote:
>
>> Hi Dannon,
>>
>> Thanks for the response. Sorry to be pedantic, but just to make sure that
>> I understand the interpretation of this field on the other side of the API,
>> I would need to have something like the following:
>>
>> data['filesystem_paths'] = "/home/me/file1.vcf \n /home/me/file2.vcf \n
>> /home/me/file3.vcf"
>>
>> I assume I should also increase the time.sleep() to reflect the uploading
>> of extra files?
>>
>> Cheers,
>>
>> Rob
>>
>> Rob Leclerc, PhD
>> P: (US) +1-(917)-873-3037
>> P: (Shanghai) +86-1-(861)-612-5469
>> Personal Email: rob.leclerc@aya.yale.edu
>>
>>
>> On Mon, Apr 29, 2013 at 9:15 AM, Dannon Baker <dannon.baker@gmail.com> wrote:
>>
>>> Hey Rob,
>>>
>>> That example_watch_folder.py does submit exactly one at a time, executes
>>> the workflow, and then does the next, all in separate transactions.
>>> If you wanted to upload multiple filepaths at once, you'd just append more
>>> to the 'filesystem_paths' field (newline-separated paths).
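>>>
>>> For instance, something like this (my illustration, not from the script):
>>>
>>>     data['filesystem_paths'] = "\n".join(fullpaths)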
>>>
>>> -Dannon
>>>
>>>
>>> On Fri, Apr 26, 2013 at 11:54 PM, Rob Leclerc <robert.leclerc@gmail.com> wrote:
>>>
>>>> I'm looking at example_watch_folder.py and it's not clear from the
>>>> example how you submit multiple datasets to a library. In the example, the
>>>> first submit returns a libset with only a single entry, and the script then
>>>> iterates through each dataset in the libset in the following section:
>>>>
>>>> data = {}
>>>> data['folder_id'] = library_folder_id
>>>> data['file_type'] = 'auto'
>>>> data['dbkey'] = ''
>>>> data['upload_option'] = 'upload_paths'
>>>> data['filesystem_paths'] = fullpath
>>>> data['create_type'] = 'file'
>>>> libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
>>>> time.sleep(5)
>>>> for ds in libset:
>>>>     if 'id' in ds:
>>>>         wf_data = {}
>>>>         wf_data['workflow_id'] = workflow['id']
>>>>         wf_data['history'] = "%s - %s" % (fname, workflow['name'])
>>>>         wf_data['ds_map'] = {}
>>>>         for step_id, ds_in in workflow['inputs'].iteritems():
>>>>             wf_data['ds_map'][step_id] = {'src':'ld', 'id':ds['id']}
>>>>         res = submit(api_key, api_url + 'workflows', wf_data, return_formatted=False)
>>>>
>>>> Rob Leclerc, PhD
>>>> P: (US) +1-(917)-873-3037
>>>> P: (Shanghai) +86-1-(861)-612-5469
>>>> Personal Email: rob.leclerc@aya.yale.edu
>>>>