Skip to content

Commit dc0df47

Browse files
committed
Match gsutil behaviour
1 parent 8f7abd2 commit dc0df47

File tree

4 files changed

+29
-36
lines changed

4 files changed

+29
-36
lines changed

sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsFileSystem.java

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -73,7 +73,7 @@ protected List<MatchResult> match(List<String> specs) throws IOException {
7373
List<Boolean> isGlobBooleans = Lists.newArrayList();
7474

7575
for (GcsPath path : gcsPaths) {
76-
if (GcsUtil.isGlob(path)) {
76+
if (GcsUtil.isWildcard(path)) {
7777
globs.add(path);
7878
isGlobBooleans.add(true);
7979
} else {
@@ -178,8 +178,8 @@ public MatchResult apply(GcsPath gcsPath) {
178178
*/
179179
@VisibleForTesting
180180
MatchResult expand(GcsPath gcsPattern) throws IOException {
181-
String prefix = GcsUtil.getGlobPrefix(gcsPattern.getObject());
182-
Pattern p = Pattern.compile(GcsUtil.globToRegexp(gcsPattern.getObject()));
181+
String prefix = GcsUtil.getNonWildcardPrefix(gcsPattern.getObject());
182+
Pattern p = Pattern.compile(GcsUtil.wildcardToRegexp(gcsPattern.getObject()));
183183

184184
LOG.debug("matching files in bucket {}, prefix {} against pattern {}", gcsPattern.getBucket(),
185185
prefix, p.toString());

sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/extensions/gcp/storage/GcsPathValidator.java

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,7 +23,6 @@
2323
import org.apache.beam.sdk.extensions.gcp.options.GcsOptions;
2424
import org.apache.beam.sdk.io.fs.ResourceId;
2525
import org.apache.beam.sdk.options.PipelineOptions;
26-
import org.apache.beam.sdk.util.GcsUtil;
2726
import org.apache.beam.sdk.util.gcsfs.GcsPath;
2827

2928
/**
@@ -47,7 +46,7 @@ public static GcsPathValidator fromOptions(PipelineOptions options) {
4746
*/
4847
@Override
4948
public void validateInputFilePatternSupported(String filepattern) {
50-
GcsPath gcsPath = getGcsPath(filepattern);
49+
getGcsPath(filepattern);
5150
verifyPath(filepattern);
5251
verifyPathIsAccessible(filepattern, "Could not find file %s");
5352
}

sdks/java/extensions/google-cloud-platform-core/src/main/java/org/apache/beam/sdk/util/GcsUtil.java

Lines changed: 12 additions & 20 deletions
Original file line numberDiff line numberDiff line change
@@ -125,14 +125,6 @@ public static GcsUtil create(
125125
/** Matches a glob containing a wildcard, capturing the portion before the first wildcard. */
126126
private static final Pattern GLOB_PREFIX = Pattern.compile("(?<PREFIX>[^\\[*?]*)[\\[*?].*");
127127

128-
private static final String RECURSIVE_WILDCARD = "[*]{2}";
129-
130-
/**
131-
* A {@link Pattern} for globs with a recursive wildcard.
132-
*/
133-
private static final Pattern RECURSIVE_GCS_PATTERN =
134-
Pattern.compile(".*" + RECURSIVE_WILDCARD + ".*");
135-
136128
/**
137129
* Maximum number of requests permitted in a GCS batch request.
138130
*/
@@ -162,7 +154,7 @@ public static GcsUtil create(
162154
/**
163155
* Returns the prefix portion of the glob that doesn't contain wildcards.
164156
*/
165-
public static String getGlobPrefix(String globExp) {
157+
public static String getNonWildcardPrefix(String globExp) {
166158
Matcher m = GLOB_PREFIX.matcher(globExp);
167159
checkArgument(
168160
m.matches(),
@@ -176,9 +168,9 @@ public static String getGlobPrefix(String globExp) {
176168
* @param globExp the glob expression to expand
177169
* @return a string with the regular expression this glob expands to
178170
*/
179-
public static String globToRegexp(String globExp) {
171+
public static String wildcardToRegexp(String globExp) {
180172
StringBuilder dst = new StringBuilder();
181-
char[] src = globExp.toCharArray();
173+
char[] src = globExp.replace("**/*", "**").toCharArray();
182174
int i = 0;
183175
while (i < src.length) {
184176
char c = src[i++];
@@ -187,7 +179,7 @@ public static String globToRegexp(String globExp) {
187179
dst.append(".*");
188180
break;
189181
case '?':
190-
dst.append(".");
182+
dst.append("[^/]");
191183
break;
192184
case '.':
193185
case '+':
@@ -213,9 +205,9 @@ public static String globToRegexp(String globExp) {
213205
}
214206

215207
/**
216-
* Returns true if the given {@code spec} contains glob.
208+
* Returns true if the given {@code spec} contains wildcard.
217209
*/
218-
public static boolean isGlob(GcsPath spec) {
210+
public static boolean isWildcard(GcsPath spec) {
219211
return GLOB_PREFIX.matcher(spec.getObject()).matches();
220212
}
221213

@@ -243,8 +235,12 @@ protected void setStorageClient(Storage storageClient) {
243235
public List<GcsPath> expand(GcsPath gcsPattern) throws IOException {
244236
Pattern p = null;
245237
String prefix = null;
246-
if (!isGlob(gcsPattern)) {
247-
// Not a glob.
238+
if (isWildcard(gcsPattern)) {
239+
// Part before the first wildcard character.
240+
prefix = getNonWildcardPrefix(gcsPattern.getObject());
241+
p = Pattern.compile(wildcardToRegexp(gcsPattern.getObject()));
242+
} else {
243+
// Not a wildcard.
248244
try {
249245
// Use a get request to fetch the metadata of the object, and ignore the return value.
250246
// The request has strong global consistency.
@@ -254,10 +250,6 @@ public List<GcsPath> expand(GcsPath gcsPattern) throws IOException {
254250
// If the path was not found, return an empty list.
255251
return ImmutableList.of();
256252
}
257-
} else {
258-
// Part before the first wildcard character.
259-
prefix = getGlobPrefix(gcsPattern.getObject());
260-
p = Pattern.compile(globToRegexp(gcsPattern.getObject()));
261253
}
262254

263255
LOG.debug("matching files in bucket {}, prefix {} against pattern {}", gcsPattern.getBucket(),

sdks/java/extensions/google-cloud-platform-core/src/test/java/org/apache/beam/sdk/util/GcsUtilTest.java

Lines changed: 13 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -92,10 +92,10 @@ public class GcsUtilTest {
9292

9393
@Test
9494
public void testGlobTranslation() {
95-
assertEquals("foo", GcsUtil.globToRegexp("foo"));
96-
assertEquals("fo.*o", GcsUtil.globToRegexp("fo*o"));
97-
assertEquals("f.*o\\..", GcsUtil.globToRegexp("f*o.?"));
98-
assertEquals("foo-[0-9].*", GcsUtil.globToRegexp("foo-[0-9]*"));
95+
assertEquals("foo", GcsUtil.wildcardToRegexp("foo"));
96+
assertEquals("fo.*o", GcsUtil.wildcardToRegexp("fo*o"));
97+
assertEquals("f.*o\\.[^/]", GcsUtil.wildcardToRegexp("f*o.?"));
98+
assertEquals("foo-[0-9].*", GcsUtil.wildcardToRegexp("foo-[0-9]*"));
9999
}
100100

101101
private static GcsOptions gcsOptionsWithTestCredential() {
@@ -278,11 +278,12 @@ public void testRecursiveGlobExpansion() throws IOException {
278278
items.add(new StorageObject().setBucket("testbucket").setName("testdirectory/"));
279279

280280
// Files within the directory
281-
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file1name"));
282-
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file2name"));
283-
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file3name"));
281+
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file1.txt"));
282+
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file2.txt"));
283+
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/file3.txt"));
284284
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/otherfile"));
285285
items.add(new StorageObject().setBucket("testbucket").setName("test/directory/anotherfile"));
286+
items.add(new StorageObject().setBucket("testbucket").setName("test/file4.txt"));
286287

287288
modelObjects.setItems(items);
288289

@@ -295,11 +296,12 @@ public void testRecursiveGlobExpansion() throws IOException {
295296
when(mockStorageList.execute()).thenReturn(modelObjects);
296297

297298
{
298-
GcsPath pattern = GcsPath.fromUri("gs://testbucket/test**/fi*name");
299+
GcsPath pattern = GcsPath.fromUri("gs://testbucket/test/**/*.txt");
299300
List<GcsPath> expectedFiles = ImmutableList.of(
300-
GcsPath.fromUri("gs://testbucket/test/directory/file1name"),
301-
GcsPath.fromUri("gs://testbucket/test/directory/file2name"),
302-
GcsPath.fromUri("gs://testbucket/test/directory/file3name"));
301+
GcsPath.fromUri("gs://testbucket/test/directory/file1.txt"),
302+
GcsPath.fromUri("gs://testbucket/test/directory/file2.txt"),
303+
GcsPath.fromUri("gs://testbucket/test/directory/file3.txt"),
304+
GcsPath.fromUri("gs://testbucket/test/file4.txt"));
303305

304306
assertThat(expectedFiles, contains(gcsUtil.expand(pattern).toArray()));
305307
}

0 commit comments

Comments
 (0)