Hi Neil,

Is fullpaths a *new line* delimited string?
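If it is coming straight from sys.argv, it will be a single string, and iterating over a string in Python yields one character at a time, which would explain the output below. A minimal sketch of the fix, assuming the paths are passed as one comma-separated command-line argument (as in your invocation):

    fullpaths = sys.argv[3].split(',')       # a list of paths, not one string
    fullpaths_string = "\n".join(fullpaths)  # newline-delimited, as Galaxy expects

upload_files expects a list such as ['/path/a.nii.gz', '/path/b.nii.gz'].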

Rob Leclerc, PhD
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.leclerc@aya.yale.edu


On May 30, 2013, at 1:09 AM, <Neil.Burdett@csiro.au> wrote:

Hi Rob,

Thanks for the class. I assume you created it in “example_watch_folder.py”, or whatever you may have renamed it to? Can you send me the full Python script if possible?

 

I modified example_watch_folder.py as follows (using your code):

 

if __name__ == '__main__':
    try:
        api_key = sys.argv[1]
        api_url = sys.argv[2]
        #in_folder = sys.argv[3]
        #out_folder = sys.argv[4]
        fullpaths = sys.argv[3]
        data_library = sys.argv[4]
        workflow = sys.argv[5]
    except IndexError:
        print 'usage: %s key url fullpaths data_library workflow' % os.path.basename( sys.argv[0] )
        sys.exit( 1 )
    #main(api_key, api_url, in_folder, out_folder, data_library, workflow )
    main(api_key, api_url, fullpaths, data_library, workflow )

#def main(api_key, api_url, in_folder, out_folder, data_library, workflow):
def main(api_key, api_url, fullpaths, data_library, workflow):
...

 

while 1:
    #Galaxy needs to read the pathnames as a new line delimited string
    #so we do that transformation here
    print fullpaths
    fullpaths_string = ""
    for path in fullpaths:
        fullpaths_string = fullpaths_string + path + "\n"

    fullpaths_string = fullpaths_string[:-1]
    data = {}
    data['folder_id'] = library_folder_id
    data['file_type'] = 'auto'
    data['dbkey'] = ''
    data['upload_option'] = 'upload_paths'
    data['filesystem_paths'] = fullpaths_string
    data['create_type'] = 'file'
    print "before libset "
    #Start the upload. This will return right away, but it may take awhile
    libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
    print "after libset "
    #Iterate through each dataset we just uploaded and block until all files have been written to the Galaxy database
    for ds in libset:
        last_filesize = 0
        while True:
            #If file_size != 0 and file_size is unchanged after a second check, we assume the disk write is finished
            ds_id = ds['id']
            uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds_id), return_formatted=False)
            print uploaded_file
            if uploaded_file['file_size'] != 0 and uploaded_file['file_size'] == last_filesize:
                break
            else:
                last_filesize = uploaded_file['file_size']
                time.sleep(2)

 

However, when I run this I get the following output, i.e. there is a new line after each character. Should you not use os.path.dirname? Output:

 

./milxview_watch_folder.py de5f19fcf64a47ca9b61cfc3bf41490c http://barium-rbh/csiro/api/ "/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz,/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz" "This One" f2db41e1fa331b3e
/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz,/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz
/
h
o
m
e
/
g
a
l
[... output continues with one character per line, spelling out the full comma-separated string twice ...]
.
g
z
before libset
after libset
Traceback (most recent call last):
  File "./milxview_watch_folder.py", line 127, in <module>
    main(api_key, api_url, fullpaths, data_library, workflow )
  File "./milxview_watch_folder.py", line 70, in main
    ds_id = ds['id']
TypeError: string indices must be integers, not str
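
The TypeError here means libset is a plain string, not a list of dataset dictionaries, so "for ds in libset" yields single characters; most likely submit() returned an error message because the malformed filesystem_paths was rejected. A hedged guard before the loop (not in the original script) would fail fast instead:

    # Hypothetical guard: submit() returns a list of dataset dicts on
    # success; anything else means the upload request itself failed.
    if not isinstance(libset, list):
        sys.exit("upload failed, server response: %r" % (libset,))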

 

 

From: Rob Leclerc [mailto:robert.leclerc@gmail.com]
Sent: Wednesday, 29 May 2013 11:38 PM
To: Burdett, Neil (ICT Centre, Herston - RBWH)
Cc: galaxy-dev@lists.bx.psu.edu; Dannon Baker
Subject: Re: Creating multiple datasets in a libset

 

Hi Neil,

 

I've attached my class function for uploading multiple files. 

 

def upload_files(self, fullpaths):
    """
    Uploads files from a disk location to a Galaxy library.
    Accepts an array (list) of full path filenames.
    Example: fullpaths = ['/home/username/file1.txt', '/home/username/file2.txt']
    """
    if self.jsonstring == None:
        self.get_library()

    library_id = self.library_id
    library_folder_id = self.library_folder_id
    api_key = self.api_key
    api_url = self.api_url

    #Galaxy needs to read the pathnames as a new line delimited string
    #so we do that transformation here
    fullpaths_string = ""
    for path in fullpaths:
        fullpaths_string = fullpaths_string + path + "\n"

    fullpaths_string = fullpaths_string[:-1]
    data = {}
    data['folder_id'] = library_folder_id
    data['file_type'] = 'auto'
    data['dbkey'] = ''
    data['upload_option'] = 'upload_paths'
    data['filesystem_paths'] = fullpaths_string
    data['create_type'] = 'file'
    #Start the upload. This will return right away, but it may take awhile
    libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)

    #Iterate through each dataset we just uploaded and block until all files have been written to the Galaxy database
    for ds in libset:
        last_filesize = 0
        while True:
            #If file_size != 0 and file_size is unchanged after a second check, we assume the disk write is finished
            ds_id = ds['id']
            uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds_id), return_formatted=False)
            print uploaded_file
            if uploaded_file['file_size'] != 0 and uploaded_file['file_size'] == last_filesize:
                break
            else:
                last_filesize = uploaded_file['file_size']
                time.sleep(2)

    self.libset = libset
    return libset
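
For reference, a hypothetical usage sketch; the wrapper class name below is a placeholder (only upload_files itself appears above). The key point for this thread is that fullpaths must be a list, not a single string:

    # "GalaxyLibrary" is a hypothetical name for the class that owns
    # upload_files; substitute whatever your class is actually called.
    uploader = GalaxyLibrary(api_key, api_url)
    libset = uploader.upload_files([
        '/home/username/file1.txt',
        '/home/username/file2.txt',
    ])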

 


Rob Leclerc, PhD
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.leclerc@aya.yale.edu

 

On Wed, May 29, 2013 at 12:45 AM, <Neil.Burdett@csiro.au> wrote:

Hi Guys,
         Did you manage to get multiple datasets working? I can't seem to upload multiple files; only the last file appears in the history. I changed my code in "example_watch_folder.py", as mentioned in the thread below, to add multiple files separated by a new line, and increased the sleep time:

for fname in os.listdir(in_folder):
    fullpath = os.path.join(in_folder, fname)
    print ' fullpath is [%s] ' % fullpath
    if os.path.isfile(fullpath):
        data = {}
        data['folder_id'] = library_folder_id
        data['file_type'] = 'auto'
        data['dbkey'] = ''
        data['upload_option'] = 'upload_paths'
        data['filesystem_paths'] = "/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz"
        print ' data is [%s] ' % str(data['filesystem_paths'])
        data['create_type'] = 'file'
        libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
        #TODO Handle this better, but the datatype isn't always
        # set for the followup workflow execution without this
        # pause.
        time.sleep(65)

However, I get the following crash:

./example_watch_folder.py 64f3209856a3cf4f2d034a1ad5bf851c http://barium-rbh/csiro/api/ /home/galaxy/galaxy-drop/input /home/galaxy/galaxy-drop/output "This One" f2db41e1fa331b3e

 fullpath is [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
 data is [/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz
 /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz]
url is : http://barium-rbh/csiro/api/libraries/33b43b4e7093c91f/contents?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'file_type': 'auto', 'dbkey': '', 'create_type': 'file', 'folder_id': 'F33b43b4e7093c91f', 'upload_option': 'upload_paths', 'filesystem_paths': '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T1_Screening.nii.gz\n /home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'}
url is : http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src': 'ld', 'id': 'ff5476bcf6c921fa'}}, 'history': '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
{'outputs': ['daecbdd824e1c349', '358eb58cd5463e0d', 'c0279aab05812500'], 'history': '3cc0effd29705aa3'}
url is : http://barium-rbh/csiro/api/workflows?key=64f3209856a3cf4f2d034a1ad5bf851c
data is : {'workflow_id': 'f2db41e1fa331b3e', 'ds_map': {'14': {'src': 'ld', 'id': '79966582feb6c081'}}, 'history': '141_S_0851_MRI_T2_Screening.nii.gz - apiFullCTE'}
{'outputs': ['19c51286b777bc04', '0f71f1fc170d4ab9', '256444f6e7017e58'], 'history': 'b701da857886499b'}
Traceback (most recent call last):
  File "./example_watch_folder.py", line 89, in <module>
    main(api_key, api_url, in_folder, out_folder, data_library, workflow )
  File "./example_watch_folder.py", line 75, in main
    shutil.move(fullpath, os.path.join(out_folder, fname))
  File "/usr/lib/python2.7/shutil.py", line 299, in move
    copy2(src, real_dst)
  File "/usr/lib/python2.7/shutil.py", line 128, in copy2
    copyfile(src, dst)
  File "/usr/lib/python2.7/shutil.py", line 82, in copyfile
    with open(src, 'rb') as fsrc:
IOError: [Errno 2] No such file or directory: '/home/galaxy/galaxy-drop/input/141_S_0851_MRI_T2_Screening.nii.gz'

It says there is no such file, but this file has already been copied from the input to the output directory. Any help much appreciated.
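
One likely cause: filesystem_paths is hard-coded with both files, so every pass of the os.listdir() loop (and every re-run of the script) re-submits paths that an earlier pass may already have moved to the output folder, after which shutil.move() fails. A defensive sketch, not from example_watch_folder.py itself:

    # Hedged sketch: skip the move if an earlier iteration or run has
    # already moved this file out of in_folder.
    if os.path.exists(fullpath):
        shutil.move(fullpath, os.path.join(out_folder, fname))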

Neil

------------------------------

Date: Mon, 29 Apr 2013 16:11:39 -0400
From: Rob Leclerc <robert.leclerc@gmail.com>
To: Dannon Baker <dannon.baker@gmail.com>
Cc: "galaxy-dev@bx.psu.edu" <galaxy-dev@bx.psu.edu>
Subject: Re: [galaxy-dev] Creating multiple datasets in a libset

Hi Dannon,

I've written some code to (i) query a dataset to ensure that it's been uploaded after a submit, and (ii) ensure a resulting dataset has been written to the filesystem.

#Block until all datasets have been uploaded
libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
for ds in libset:
    while True:
        uploaded_file = display(api_key, api_url + 'libraries/%s/contents/%s' % (library_id, ds['id']), return_formatted=False)
        if uploaded_file['misc_info'] == None:
            time.sleep(1)
        else:
            break

#Block until all result datasets have been saved to the filesystem
result_ds_url = api_url + 'histories/' + history_id + '/contents/' + dsh['id']
while True:
    result_ds = display(api_key, result_ds_url, return_formatted=False)
    if result_ds["state"] == 'ok':
        break
    else:
        time.sleep(1)


Rob Leclerc, PhD
<http://www.linkedin.com/in/robleclerc> <https://twitter.com/#!/robleclerc>
P: (US) +1-(917)-873-3037
P: (Shanghai) +86-1-(861)-612-5469
Personal Email: rob.leclerc@aya.yale.edu


On Mon, Apr 29, 2013 at 11:18 AM, Dannon Baker <dannon.baker@gmail.com> wrote:

> Yep, that example filesystem_paths you suggest should work fine.  The
> sleep() bit was a complete hack from the start, for simplicity in
> demonstrating a very basic pipeline, but what you probably want to do for a
> real implementation is query the dataset in question via the API, verify
> that the datatype/etc have been set, and only after that execute the
> workflow; instead of relying on sleep.
>
>
> On Mon, Apr 29, 2013 at 9:24 AM, Rob Leclerc <robert.leclerc@gmail.com> wrote:
>
>> Hi Dannon,
>>
>> Thanks for the response. Sorry to be pedantic, but just to make sure that
>> I understand the interpretation of this field on the other side of the API,
>> I would need to have something like the following:
>>
>> data['filesystem_paths'] = "/home/me/file1.vcf \n /home/me/file2.vcf \n /home/me/file3.vcf"
>>
>> I assume I should also increase the time.sleep() to reflect the uploading
>> of extra files?
>>
>> Cheers,
>>
>> Rob
>>
>> Rob Leclerc, PhD
>> <http://www.linkedin.com/in/robleclerc> <https://twitter.com/#!/robleclerc>
>> P: (US) +1-(917)-873-3037
>> P: (Shanghai) +86-1-(861)-612-5469
>> Personal Email: rob.leclerc@aya.yale.edu
>>
>>
>> On Mon, Apr 29, 2013 at 9:15 AM, Dannon Baker <dannon.baker@gmail.com> wrote:
>>
>>> Hey Rob,
>>>
>>> That example_watch_folder.py does just submit exactly one at a time,
>>> executes the workflow, and then does the next all in separate transactions.
>>>  If you wanted to upload multiple filepaths at once, you'd just append more
>>> to the ''filesystem_paths' field (newline separated paths).
>>>
>>> -Dannon
>>>
>>>
>>> On Fri, Apr 26, 2013 at 11:54 PM, Rob Leclerc <robert.leclerc@gmail.com> wrote:
>>>
>>>> I'm looking at example_watch_folder.py and it's not clear from the
>>>> example how you submit multiple datasets to a library. In the example, the
>>>> first submit returns a libset [] with only a single entry and then proceeds
>>>> to iterate through each dataset in the libset in the following section:
>>>>
>>>> data = {}
>>>> data['folder_id'] = library_folder_id
>>>> data['file_type'] = 'auto'
>>>> data['dbkey'] = ''
>>>> data['upload_option'] = 'upload_paths'
>>>> *data['filesystem_paths'] = fullpath*
>>>> data['create_type'] = 'file'
>>>> libset = submit(api_key, api_url + "libraries/%s/contents" % library_id, data, return_formatted = False)
>>>> time.sleep(5)
>>>> for ds in libset:
>>>>     if 'id' in ds:
>>>>         wf_data = {}
>>>>         wf_data['workflow_id'] = workflow['id']
>>>>         wf_data['history'] = "%s - %s" % (fname, workflow['name'])
>>>>         wf_data['ds_map'] = {}
>>>>         for step_id, ds_in in workflow['inputs'].iteritems():
>>>>             wf_data['ds_map'][step_id] = {'src':'ld', 'id':ds['id']}
>>>>         res = submit( api_key, api_url + 'workflows', wf_data, return_formatted=False)
>>>>
>>>> Rob Leclerc, PhD
>>>> <http://www.linkedin.com/in/robleclerc> <https://twitter.com/#!/robleclerc>
>>>> P: (US) +1-(917)-873-3037
>>>> P: (Shanghai) +86-1-(861)-612-5469
>>>> Personal Email: rob.leclerc@aya.yale.edu
>>>>